xref: /titanic_52/usr/src/uts/common/inet/ip/icmp.c (revision 2264ca7f5db194583c672cb5779a67f52bcd92a9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/stropts.h>
30 #include <sys/strlog.h>
31 #include <sys/strsun.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/timod.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/kmem.h>
41 #include <sys/policy.h>
42 #include <sys/priv.h>
43 #include <sys/zone.h>
44 #include <sys/time.h>
45 
46 #include <sys/sockio.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/isa_defs.h>
50 #include <sys/suntpi.h>
51 #include <sys/xti_inet.h>
52 #include <sys/netstack.h>
53 
54 #include <net/route.h>
55 #include <net/if.h>
56 
57 #include <netinet/in.h>
58 #include <netinet/ip6.h>
59 #include <netinet/icmp6.h>
60 #include <inet/common.h>
61 #include <inet/ip.h>
62 #include <inet/ip6.h>
63 #include <inet/proto_set.h>
64 #include <inet/nd.h>
65 #include <inet/optcom.h>
66 #include <inet/snmpcom.h>
67 #include <inet/kstatcom.h>
68 #include <inet/rawip_impl.h>
69 
70 #include <netinet/ip_mroute.h>
71 #include <inet/tcp.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 #include <inet/ipclassifier.h>
75 
76 #include <sys/tsol/label.h>
77 #include <sys/tsol/tnet.h>
78 
79 #include <inet/ip_ire.h>
80 #include <inet/ip_if.h>
81 
82 #include <inet/ip_impl.h>
83 #include <sys/disp.h>
84 
85 /*
86  * Synchronization notes:
87  *
88  * RAWIP is MT and uses the usual kernel synchronization primitives. There is
89  * one lock, which is icmp_rwlock. We also use conn_lock when updating things
90  * which affect the IP classifier lookup.
91  * The lock order is icmp_rwlock -> conn_lock.
92  *
93  * The icmp_rwlock:
94  * This protects most of the other fields in the icmp_t. The exact list of
95  * fields which are protected by each of the above locks is documented in
96  * the icmp_t structure definition.
97  *
98  * Plumbing notes:
99  * ICMP is always a device driver. For compatibility with mibopen() code
100  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
101  * dummy module.
102  */
103 
104 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
105 static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
106 static int	icmp_bind_proto(conn_t *connp);
107 static int	icmp_build_hdrs(icmp_t *icmp);
108 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
109 static int	icmp_close(queue_t *q, int flags);
110 static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
111 static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
112 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
113 		    int sys_error);
114 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
115 		    t_scalar_t t_error, int sys_error);
116 static void	icmp_icmp_error(conn_t *connp, mblk_t *mp);
117 static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp);
118 static void	icmp_info_req(queue_t *q, mblk_t *mp);
119 static void	icmp_input(void *, mblk_t *, void *);
120 static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
121 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
122 		    cred_t *credp);
123 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
124 		    cred_t *credp);
125 static int	icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
126 		    int *errorp, void *thisdg_attrs);
127 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
128 int		icmp_opt_set(conn_t *connp, uint_t optset_context,
129 		    int level, int name, uint_t inlen,
130 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
131 		    void *thisdg_attrs, cred_t *cr);
132 int		icmp_opt_get(conn_t *connp, int level, int name,
133 		    uchar_t *ptr);
134 static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
135 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
136 static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
137 		    caddr_t cp, cred_t *cr);
138 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
139 		    uchar_t *ptr, int len);
140 static int	icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
141 		    cred_t *cr);
142 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
143 static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
144 static int	icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst);
145 static void	icmp_wput(queue_t *q, mblk_t *mp);
146 static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
147 static int	raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp,
148 		    sin6_t *sin6, ip6_pkt_t *ipp);
149 static int	raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp,
150 		    ipaddr_t v4dst, ip4_pkt_t *pktinfop);
151 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
152 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
153 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
154 
155 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
156 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
157 
158 static void	*rawip_kstat_init(netstackid_t stackid);
159 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
160 static int	rawip_kstat_update(kstat_t *kp, int rw);
161 static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);
162 static int	rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa,
163 		    uint_t *salenp);
164 static int	rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa,
165 		    uint_t *salenp);
166 
167 int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
168 		    socklen_t *, cred_t *);
169 int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
170 		    socklen_t *, cred_t *);
171 
172 static struct module_info icmp_mod_info =  {
173 	5707, "icmp", 1, INFPSZ, 512, 128
174 };
175 
176 /*
177  * Entry points for ICMP as a device.
178  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
179  */
180 static struct qinit icmprinitv4 = {
181 	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
182 };
183 
184 static struct qinit icmprinitv6 = {
185 	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
186 };
187 
188 static struct qinit icmpwinit = {
189 	(pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info
190 };
191 
192 /* ICMP entry point during fallback */
193 static struct qinit icmp_fallback_sock_winit = {
194 	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
195 };
196 
197 /* For AF_INET aka /dev/icmp */
198 struct streamtab icmpinfov4 = {
199 	&icmprinitv4, &icmpwinit
200 };
201 
202 /* For AF_INET6 aka /dev/icmp6 */
203 struct streamtab icmpinfov6 = {
204 	&icmprinitv6, &icmpwinit
205 };
206 
207 static sin_t	sin_null;	/* Zero address for quick clears */
208 static sin6_t	sin6_null;	/* Zero address for quick clears */
209 
210 /* Default structure copied into T_INFO_ACK messages */
211 static struct T_info_ack icmp_g_t_info_ack = {
212 	T_INFO_ACK,
213 	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
214 	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
215 	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
216 	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
217 	0,		/* ADDR_size - filled in later. */
218 	0,		/* OPT_size - not initialized here */
219 	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
220 	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
221 	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
222 	(XPG4_1|SENDZERO) /* PROVIDER_flag */
223 };
224 
225 /*
226  * Table of ND variables supported by icmp.  These are loaded into is_nd
227  * when the stack instance is created.
228  * All of these are alterable, within the min/max values given, at run time.
229  */
230 static icmpparam_t	icmp_param_arr[] = {
231 	/* min	max	value	name */
232 	{ 0,	128,	32,	"icmp_wroff_extra" },
233 	{ 1,	255,	255,	"icmp_ipv4_ttl" },
234 	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
235 	{ 0,	1,	1,	"icmp_bsd_compat" },
236 	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
237 	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
238 	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
239 	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
240 };
241 #define	is_wroff_extra			is_param_arr[0].icmp_param_value
242 #define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
243 #define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
244 #define	is_bsd_compat			is_param_arr[3].icmp_param_value
245 #define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
246 #define	is_xmit_lowat			is_param_arr[5].icmp_param_value
247 #define	is_recv_hiwat			is_param_arr[6].icmp_param_value
248 #define	is_max_buf			is_param_arr[7].icmp_param_value
249 
250 static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len);
251 static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa,
252     socklen_t len, cred_t *cr);
253 static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error);
254 
255 /*
256  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
257  * passed to icmp_wput.
258  * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP
259  * protocol type placed in the message following the address. A T_BIND_ACK
260  * message is returned by ip_bind_v4/v6.
261  */
262 static void
263 icmp_tpi_bind(queue_t *q, mblk_t *mp)
264 {
265 	int	error;
266 	struct sockaddr *sa;
267 	struct T_bind_req *tbr;
268 	socklen_t	len;
269 	sin_t	*sin;
270 	sin6_t	*sin6;
271 	icmp_t		*icmp;
272 	conn_t	*connp = Q_TO_CONN(q);
273 	mblk_t *mp1;
274 	cred_t *cr;
275 
276 	/*
277 	 * All Solaris components should pass a db_credp
278 	 * for this TPI message, hence we ASSERT.
279 	 * But in case there is some other M_PROTO that looks
280 	 * like a TPI message sent by some other kernel
281 	 * component, we check and return an error.
282 	 */
283 	cr = msg_getcred(mp, NULL);
284 	ASSERT(cr != NULL);
285 	if (cr == NULL) {
286 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
287 		return;
288 	}
289 
290 	icmp = connp->conn_icmp;
291 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
292 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
293 		    "icmp_bind: bad req, len %u",
294 		    (uint_t)(mp->b_wptr - mp->b_rptr));
295 		icmp_err_ack(q, mp, TPROTO, 0);
296 		return;
297 	}
298 
299 	if (icmp->icmp_state != TS_UNBND) {
300 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
301 		    "icmp_bind: bad state, %d", icmp->icmp_state);
302 		icmp_err_ack(q, mp, TOUTSTATE, 0);
303 		return;
304 	}
305 
306 	/*
307 	 * Reallocate the message to make sure we have enough room for an
308 	 * address and the protocol type.
309 	 */
310 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
311 	if (!mp1) {
312 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
313 		return;
314 	}
315 	mp = mp1;
316 
317 	/* Reset the message type in preparation for shipping it back. */
318 	DB_TYPE(mp) = M_PCPROTO;
319 	tbr = (struct T_bind_req *)mp->b_rptr;
320 	len = tbr->ADDR_length;
321 	switch (len) {
322 	case 0:	/* request for a generic port */
323 		tbr->ADDR_offset = sizeof (struct T_bind_req);
324 		if (icmp->icmp_family == AF_INET) {
325 			tbr->ADDR_length = sizeof (sin_t);
326 			sin = (sin_t *)&tbr[1];
327 			*sin = sin_null;
328 			sin->sin_family = AF_INET;
329 			mp->b_wptr = (uchar_t *)&sin[1];
330 			sa = (struct sockaddr *)sin;
331 			len = sizeof (sin_t);
332 		} else {
333 			ASSERT(icmp->icmp_family == AF_INET6);
334 			tbr->ADDR_length = sizeof (sin6_t);
335 			sin6 = (sin6_t *)&tbr[1];
336 			*sin6 = sin6_null;
337 			sin6->sin6_family = AF_INET6;
338 			mp->b_wptr = (uchar_t *)&sin6[1];
339 			sa = (struct sockaddr *)sin6;
340 			len = sizeof (sin6_t);
341 		}
342 		break;
343 
344 	case sizeof (sin_t):	/* Complete IPv4 address */
345 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
346 		    sizeof (sin_t));
347 		break;
348 
349 	case sizeof (sin6_t):	/* Complete IPv6 address */
350 		sa = (struct sockaddr *)mi_offset_param(mp,
351 		    tbr->ADDR_offset, sizeof (sin6_t));
352 		break;
353 
354 	default:
355 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
356 		    "icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
357 		icmp_err_ack(q, mp, TBADADDR, 0);
358 		return;
359 	}
360 
361 	error = rawip_do_bind(connp, sa, len);
362 done:
363 	ASSERT(mp->b_cont == NULL);
364 	if (error != 0) {
365 		if (error > 0) {
366 			icmp_err_ack(q, mp, TSYSERR, error);
367 		} else {
368 			icmp_err_ack(q, mp, -error, 0);
369 		}
370 	} else {
371 		tbr->PRIM_type = T_BIND_ACK;
372 		qreply(q, mp);
373 	}
374 }
375 
376 static int
377 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
378 {
379 	sin_t		*sin;
380 	sin6_t		*sin6;
381 	icmp_t		*icmp;
382 	int		error = 0;
383 	mblk_t		*ire_mp;
384 
385 
386 	icmp = connp->conn_icmp;
387 
388 	if (sa == NULL || !OK_32PTR((char *)sa)) {
389 		return (EINVAL);
390 	}
391 
392 	/*
393 	 * The state must be TS_UNBND. TPI mandates that users must send
394 	 * TPI primitives only 1 at a time and wait for the response before
395 	 * sending the next primitive.
396 	 */
397 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
398 	if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
399 		error = -TOUTSTATE;
400 		goto done;
401 	}
402 
403 	ASSERT(len != 0);
404 	switch (len) {
405 	case sizeof (sin_t):    /* Complete IPv4 address */
406 		sin = (sin_t *)sa;
407 		if (sin->sin_family != AF_INET ||
408 		    icmp->icmp_family != AF_INET) {
409 			/* TSYSERR, EAFNOSUPPORT */
410 			error = EAFNOSUPPORT;
411 			goto done;
412 		}
413 		break;
414 	case sizeof (sin6_t): /* Complete IPv6 address */
415 		sin6 = (sin6_t *)sa;
416 		if (sin6->sin6_family != AF_INET6 ||
417 		    icmp->icmp_family != AF_INET6) {
418 			/* TSYSERR, EAFNOSUPPORT */
419 			error = EAFNOSUPPORT;
420 			goto done;
421 		}
422 		/* No support for mapped addresses on raw sockets */
423 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
424 			/* TSYSERR, EADDRNOTAVAIL */
425 			error = EADDRNOTAVAIL;
426 			goto done;
427 		}
428 		break;
429 
430 	default:
431 		/* TBADADDR */
432 		error = EADDRNOTAVAIL;
433 		goto done;
434 	}
435 
436 	icmp->icmp_pending_op = T_BIND_REQ;
437 	icmp->icmp_state = TS_IDLE;
438 
439 	/*
440 	 * Copy the source address into our icmp structure.  This address
441 	 * may still be zero; if so, ip will fill in the correct address
442 	 * each time an outbound packet is passed to it.
443 	 * If we are binding to a broadcast or multicast address then
444 	 * rawip_post_ip_bind_connect will clear the source address.
445 	 */
446 
447 	if (icmp->icmp_family == AF_INET) {
448 		ASSERT(sin != NULL);
449 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
450 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr,
451 		    &icmp->icmp_v6src);
452 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
453 		    icmp->icmp_ip_snd_options_len;
454 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
455 	} else {
456 		int error;
457 
458 		ASSERT(sin6 != NULL);
459 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
460 		icmp->icmp_v6src = sin6->sin6_addr;
461 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
462 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
463 
464 		/* Rebuild the header template */
465 		error = icmp_build_hdrs(icmp);
466 		if (error != 0) {
467 			icmp->icmp_pending_op = -1;
468 			/*
469 			 * TSYSERR
470 			 */
471 			goto done;
472 		}
473 	}
474 
475 	ire_mp = NULL;
476 	if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
477 		/*
478 		 * request an IRE if src not 0 (INADDR_ANY)
479 		 */
480 		ire_mp = allocb(sizeof (ire_t), BPRI_HI);
481 		if (ire_mp == NULL) {
482 			icmp->icmp_pending_op = -1;
483 			error = ENOMEM;
484 			goto done;
485 		}
486 		DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
487 	}
488 done:
489 	rw_exit(&icmp->icmp_rwlock);
490 	if (error != 0)
491 		return (error);
492 
493 	if (icmp->icmp_family == AF_INET6) {
494 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
495 		    &sin6->sin6_addr, sin6->sin6_port, B_TRUE);
496 	} else {
497 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
498 		    sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
499 	}
500 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
501 	return (error);
502 }
503 
504 static void
505 rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error)
506 {
507 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
508 	if (icmp->icmp_state == TS_UNBND) {
509 		/*
510 		 * not yet bound - bind sent by icmp_bind_proto.
511 		 */
512 		rw_exit(&icmp->icmp_rwlock);
513 		return;
514 	}
515 	ASSERT(icmp->icmp_pending_op != -1);
516 	icmp->icmp_pending_op = -1;
517 
518 	if (error != 0) {
519 		if (icmp->icmp_state == TS_DATA_XFER) {
520 			/* Connect failed */
521 			/* Revert back to the bound source */
522 			icmp->icmp_v6src = icmp->icmp_bound_v6src;
523 			icmp->icmp_state = TS_IDLE;
524 			if (icmp->icmp_family == AF_INET6)
525 				(void) icmp_build_hdrs(icmp);
526 		} else {
527 			V6_SET_ZERO(icmp->icmp_v6src);
528 			V6_SET_ZERO(icmp->icmp_bound_v6src);
529 			icmp->icmp_state = TS_UNBND;
530 			if (icmp->icmp_family == AF_INET6)
531 				(void) icmp_build_hdrs(icmp);
532 		}
533 	} else {
534 		if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
535 			ire_t *ire;
536 
537 			ire = (ire_t *)ire_mp->b_rptr;
538 			/*
539 			 * If a broadcast/multicast address was bound set
540 			 * the source address to 0.
541 			 * This ensures no datagrams with broadcast address
542 			 * as source address are emitted (which would violate
543 			 * RFC1122 - Hosts requirements)
544 			 * Note: we get IRE_BROADCAST for IPv6
545 			 * to "mark" a multicast local address.
546 			 */
547 
548 
549 			if (ire->ire_type == IRE_BROADCAST &&
550 			    icmp->icmp_state != TS_DATA_XFER) {
551 				/*
552 				 * This was just a local bind to a
553 				 * MC/broadcast addr
554 				 */
555 				V6_SET_ZERO(icmp->icmp_v6src);
556 				if (icmp->icmp_family == AF_INET6)
557 					(void) icmp_build_hdrs(icmp);
558 			}
559 		}
560 
561 	}
562 	rw_exit(&icmp->icmp_rwlock);
563 	if (ire_mp != NULL)
564 		freeb(ire_mp);
565 }
566 
567 /*
568  * Send message to IP to just bind to the protocol.
569  */
570 static int
571 icmp_bind_proto(conn_t *connp)
572 {
573 	icmp_t	*icmp;
574 	int	error;
575 
576 	icmp = connp->conn_icmp;
577 
578 	if (icmp->icmp_family == AF_INET6)
579 		error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto,
580 		    &sin6_null.sin6_addr, 0, B_TRUE);
581 	else
582 		error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto,
583 		    sin_null.sin_addr.s_addr, 0, B_TRUE);
584 
585 	rawip_post_ip_bind_connect(icmp, NULL, error);
586 	return (error);
587 }
588 
589 static void
590 icmp_tpi_connect(queue_t *q, mblk_t *mp)
591 {
592 	conn_t	*connp = Q_TO_CONN(q);
593 	struct T_conn_req	*tcr;
594 	icmp_t	*icmp;
595 	struct sockaddr *sa;
596 	socklen_t len;
597 	int error;
598 	cred_t *cr;
599 
600 	/*
601 	 * All Solaris components should pass a db_credp
602 	 * for this TPI message, hence we ASSERT.
603 	 * But in case there is some other M_PROTO that looks
604 	 * like a TPI message sent by some other kernel
605 	 * component, we check and return an error.
606 	 */
607 	cr = msg_getcred(mp, NULL);
608 	ASSERT(cr != NULL);
609 	if (cr == NULL) {
610 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
611 		return;
612 	}
613 
614 	icmp = connp->conn_icmp;
615 	tcr = (struct T_conn_req *)mp->b_rptr;
616 	/* Sanity checks */
617 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
618 		icmp_err_ack(q, mp, TPROTO, 0);
619 		return;
620 	}
621 
622 	if (tcr->OPT_length != 0) {
623 		icmp_err_ack(q, mp, TBADOPT, 0);
624 		return;
625 	}
626 
627 	len = tcr->DEST_length;
628 
629 	switch (len) {
630 	default:
631 		icmp_err_ack(q, mp, TBADADDR, 0);
632 		return;
633 	case sizeof (sin_t):
634 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
635 		    sizeof (sin_t));
636 		break;
637 	case sizeof (sin6_t):
638 		sa = (struct sockaddr *)mi_offset_param(mp,
639 		    tcr->DEST_offset, sizeof (sin6_t));
640 		break;
641 	}
642 
643 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
644 	if (error != 0) {
645 		icmp_err_ack(q, mp, TSYSERR, error);
646 		return;
647 	}
648 
649 	error = rawip_do_connect(connp, sa, len, cr);
650 	if (error != 0) {
651 		if (error < 0) {
652 			icmp_err_ack(q, mp, -error, 0);
653 		} else {
654 			icmp_err_ack(q, mp, 0, error);
655 		}
656 	} else {
657 		mblk_t *mp1;
658 
659 		/*
660 		 * We have to send a connection confirmation to
661 		 * keep TLI happy.
662 		 */
663 		if (icmp->icmp_family == AF_INET) {
664 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
665 			    sizeof (sin_t), NULL, 0);
666 		} else {
667 			ASSERT(icmp->icmp_family == AF_INET6);
668 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
669 			    sizeof (sin6_t), NULL, 0);
670 		}
671 		if (mp1 == NULL) {
672 			rw_exit(&icmp->icmp_rwlock);
673 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
674 			return;
675 		}
676 
677 		/*
678 		 * Send ok_ack for T_CONN_REQ
679 		 */
680 		mp = mi_tpi_ok_ack_alloc(mp);
681 		if (mp == NULL) {
682 			/* Unable to reuse the T_CONN_REQ for the ack. */
683 			freemsg(mp1);
684 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
685 			return;
686 		}
687 		putnext(connp->conn_rq, mp);
688 		putnext(connp->conn_rq, mp1);
689 	}
690 }
691 
692 static int
693 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
694     cred_t *cr)
695 {
696 	icmp_t	*icmp;
697 	sin_t	*sin;
698 	sin6_t	*sin6;
699 	mblk_t  *ire_mp;
700 	int	error;
701 	ipaddr_t	v4dst;
702 	in6_addr_t	v6dst;
703 
704 	icmp = connp->conn_icmp;
705 
706 	if (sa == NULL || !OK_32PTR((char *)sa)) {
707 		return (EINVAL);
708 	}
709 
710 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
711 	if (ire_mp == NULL)
712 		return (ENOMEM);
713 	DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
714 
715 
716 	ASSERT(sa != NULL && len != 0);
717 
718 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
719 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
720 		rw_exit(&icmp->icmp_rwlock);
721 		freeb(ire_mp);
722 		return (-TOUTSTATE);
723 	}
724 
725 	switch (len) {
726 	case sizeof (sin_t):
727 		sin = (sin_t *)sa;
728 
729 		ASSERT(icmp->icmp_family == AF_INET);
730 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
731 
732 		v4dst = sin->sin_addr.s_addr;
733 		/*
734 		 * Interpret a zero destination to mean loopback.
735 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
736 		 * generate the T_CONN_CON.
737 		 */
738 		if (v4dst == INADDR_ANY) {
739 			v4dst = htonl(INADDR_LOOPBACK);
740 		}
741 
742 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
743 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
744 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
745 		    icmp->icmp_ip_snd_options_len;
746 		icmp->icmp_v6dst.sin6_addr = v6dst;
747 		icmp->icmp_v6dst.sin6_family = AF_INET6;
748 		icmp->icmp_v6dst.sin6_flowinfo = 0;
749 		icmp->icmp_v6dst.sin6_port = 0;
750 
751 		/*
752 		 * If the destination address is multicast and
753 		 * an outgoing multicast interface has been set,
754 		 * use the address of that interface as our
755 		 * source address if no source address has been set.
756 		 */
757 		if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY &&
758 		    CLASSD(v4dst) &&
759 		    icmp->icmp_multicast_if_addr != INADDR_ANY) {
760 			IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
761 			    &icmp->icmp_v6src);
762 		}
763 		break;
764 	case sizeof (sin6_t):
765 		sin6 = (sin6_t *)sa;
766 
767 		/* No support for mapped addresses on raw sockets */
768 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
769 			rw_exit(&icmp->icmp_rwlock);
770 			freeb(ire_mp);
771 			return (EADDRNOTAVAIL);
772 		}
773 
774 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
775 		ASSERT(icmp->icmp_family == AF_INET6);
776 
777 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
778 
779 		icmp->icmp_v6dst = *sin6;
780 		icmp->icmp_v6dst.sin6_port = 0;
781 
782 		/*
783 		 * Interpret a zero destination to mean loopback.
784 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
785 		 * generate the T_CONN_CON.
786 		 */
787 		if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) {
788 			icmp->icmp_v6dst.sin6_addr = ipv6_loopback;
789 		}
790 		/*
791 		 * If the destination address is multicast and
792 		 * an outgoing multicast interface has been set,
793 		 * then the ip bind logic will pick the correct source
794 		 * address (i.e. matching the outgoing multicast interface).
795 		 */
796 		break;
797 	}
798 
799 	icmp->icmp_pending_op = T_CONN_REQ;
800 
801 	if (icmp->icmp_state == TS_DATA_XFER) {
802 		/* Already connected - clear out state */
803 		icmp->icmp_v6src = icmp->icmp_bound_v6src;
804 		icmp->icmp_state = TS_IDLE;
805 	}
806 
807 	icmp->icmp_state = TS_DATA_XFER;
808 	rw_exit(&icmp->icmp_rwlock);
809 
810 	if (icmp->icmp_family == AF_INET6) {
811 		error = ip_proto_bind_connected_v6(connp, &ire_mp,
812 		    icmp->icmp_proto, &icmp->icmp_v6src, 0,
813 		    &icmp->icmp_v6dst.sin6_addr,
814 		    NULL, sin6->sin6_port, B_TRUE, B_TRUE, cr);
815 	} else {
816 		error = ip_proto_bind_connected_v4(connp, &ire_mp,
817 		    icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0,
818 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port,
819 		    B_TRUE, B_TRUE, cr);
820 	}
821 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
822 	return (error);
823 }
824 
825 static void
826 icmp_close_free(conn_t *connp)
827 {
828 	icmp_t *icmp = connp->conn_icmp;
829 
830 	/* If there are any options associated with the stream, free them. */
831 	if (icmp->icmp_ip_snd_options != NULL) {
832 		mi_free((char *)icmp->icmp_ip_snd_options);
833 		icmp->icmp_ip_snd_options = NULL;
834 		icmp->icmp_ip_snd_options_len = 0;
835 	}
836 
837 	if (icmp->icmp_filter != NULL) {
838 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
839 		icmp->icmp_filter = NULL;
840 	}
841 
842 	/* Free memory associated with sticky options */
843 	if (icmp->icmp_sticky_hdrs_len != 0) {
844 		kmem_free(icmp->icmp_sticky_hdrs,
845 		    icmp->icmp_sticky_hdrs_len);
846 		icmp->icmp_sticky_hdrs = NULL;
847 		icmp->icmp_sticky_hdrs_len = 0;
848 	}
849 	ip6_pkt_free(&icmp->icmp_sticky_ipp);
850 
851 	/*
852 	 * Clear any fields which the kmem_cache constructor clears.
853 	 * Only icmp_connp needs to be preserved.
854 	 * TBD: We should make this more efficient to avoid clearing
855 	 * everything.
856 	 */
857 	ASSERT(icmp->icmp_connp == connp);
858 	bzero(icmp, sizeof (icmp_t));
859 	icmp->icmp_connp = connp;
860 }
861 
862 static int
863 rawip_do_close(conn_t *connp)
864 {
865 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
866 
867 	ip_quiesce_conn(connp);
868 
869 	if (!IPCL_IS_NONSTR(connp)) {
870 		qprocsoff(connp->conn_rq);
871 	}
872 
873 	ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL &&
874 	    connp->conn_icmp->icmp_fallback_queue_tail == NULL);
875 	icmp_close_free(connp);
876 
877 	/*
878 	 * Now we are truly single threaded on this stream, and can
879 	 * delete the things hanging off the connp, and finally the connp.
880 	 * We removed this connp from the fanout list, it cannot be
881 	 * accessed thru the fanouts, and we already waited for the
882 	 * conn_ref to drop to 0. We are already in close, so
883 	 * there cannot be any other thread from the top. qprocsoff
884 	 * has completed, and service has completed or won't run in
885 	 * future.
886 	 */
887 	ASSERT(connp->conn_ref == 1);
888 
889 	if (!IPCL_IS_NONSTR(connp)) {
890 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
891 	} else {
892 		ip_free_helper_stream(connp);
893 	}
894 
895 	connp->conn_ref--;
896 	ipcl_conn_destroy(connp);
897 
898 	return (0);
899 }
900 
901 static int
902 icmp_close(queue_t *q, int flags)
903 {
904 	conn_t  *connp;
905 
906 	if (flags & SO_FALLBACK) {
907 		/*
908 		 * stream is being closed while in fallback
909 		 * simply free the resources that were allocated
910 		 */
911 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
912 		qprocsoff(q);
913 		goto done;
914 	}
915 
916 	connp = Q_TO_CONN(q);
917 	(void) rawip_do_close(connp);
918 done:
919 	q->q_ptr = WR(q)->q_ptr = NULL;
920 	return (0);
921 }
922 
923 /*
924  * This routine handles each T_DISCON_REQ message passed to icmp
925  * as an indicating that ICMP is no longer connected. This results
926  * in sending a T_BIND_REQ to IP to restore the binding to just
927  * the local address.
928  *
929  * The disconnect completes in rawip_post_ip_bind_connect.
930  */
931 static int
932 icmp_do_disconnect(conn_t *connp)
933 {
934 	icmp_t	*icmp;
935 	mblk_t	*ire_mp;
936 	int error;
937 
938 	icmp = connp->conn_icmp;
939 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
940 	if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
941 		rw_exit(&icmp->icmp_rwlock);
942 		return (-TOUTSTATE);
943 	}
944 	icmp->icmp_pending_op = T_DISCON_REQ;
945 	icmp->icmp_v6src = icmp->icmp_bound_v6src;
946 	icmp->icmp_state = TS_IDLE;
947 
948 
949 	if (icmp->icmp_family == AF_INET6) {
950 		/* Rebuild the header template */
951 		error = icmp_build_hdrs(icmp);
952 		if (error != 0) {
953 			icmp->icmp_pending_op = -1;
954 			rw_exit(&icmp->icmp_rwlock);
955 			return (error);
956 		}
957 	}
958 
959 	rw_exit(&icmp->icmp_rwlock);
960 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
961 	if (ire_mp == NULL) {
962 		return (ENOMEM);
963 	}
964 
965 	if (icmp->icmp_family == AF_INET6) {
966 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
967 		    &icmp->icmp_bound_v6src, 0, B_TRUE);
968 	} else {
969 
970 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
971 		    V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE);
972 	}
973 
974 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
975 
976 	return (error);
977 }
978 
979 static void
980 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
981 {
982 	conn_t	*connp = Q_TO_CONN(q);
983 	int	error;
984 
985 	/*
986 	 * Allocate the largest primitive we need to send back
987 	 * T_error_ack is > than T_ok_ack
988 	 */
989 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
990 	if (mp == NULL) {
991 		/* Unable to reuse the T_DISCON_REQ for the ack. */
992 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
993 		return;
994 	}
995 
996 	error = icmp_do_disconnect(connp);
997 
998 	if (error != 0) {
999 		if (error > 0) {
1000 			icmp_err_ack(q, mp, 0, error);
1001 		} else {
1002 			icmp_err_ack(q, mp, -error, 0);
1003 		}
1004 	} else {
1005 		mp = mi_tpi_ok_ack_alloc(mp);
1006 		ASSERT(mp != NULL);
1007 		qreply(q, mp);
1008 	}
1009 
1010 }
1011 
1012 static int
1013 icmp_disconnect(conn_t *connp)
1014 {
1015 	int	error;
1016 	icmp_t	*icmp = connp->conn_icmp;
1017 
1018 	icmp->icmp_dgram_errind = B_FALSE;
1019 
1020 	error = icmp_do_disconnect(connp);
1021 
1022 	if (error < 0)
1023 		error = proto_tlitosyserr(-error);
1024 	return (error);
1025 }
1026 
1027 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1028 static void
1029 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1030 {
1031 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1032 		qreply(q, mp);
1033 }
1034 
1035 /* Shorthand to generate and send TPI error acks to our client */
1036 static void
1037 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1038     t_scalar_t t_error, int sys_error)
1039 {
1040 	struct T_error_ack	*teackp;
1041 
1042 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1043 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1044 		teackp = (struct T_error_ack *)mp->b_rptr;
1045 		teackp->ERROR_prim = primitive;
1046 		teackp->TLI_error = t_error;
1047 		teackp->UNIX_error = sys_error;
1048 		qreply(q, mp);
1049 	}
1050 }
1051 
1052 /*
1053  * icmp_icmp_error is called by icmp_input to process ICMP
1054  * messages passed up by IP.
1055  * Generates the appropriate permanent (non-transient) errors.
1056  * Assumes that IP has pulled up everything up to and including
1057  * the ICMP header.
1058  */
1059 static void
1060 icmp_icmp_error(conn_t *connp, mblk_t *mp)
1061 {
1062 	icmph_t *icmph;
1063 	ipha_t	*ipha;
1064 	int	iph_hdr_length;
1065 	sin_t	sin;
1066 	mblk_t	*mp1;
1067 	int	error = 0;
1068 	icmp_t	*icmp = connp->conn_icmp;
1069 
1070 	ipha = (ipha_t *)mp->b_rptr;
1071 
1072 	ASSERT(OK_32PTR(mp->b_rptr));
1073 
1074 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1075 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1076 		icmp_icmp_error_ipv6(connp, mp);
1077 		return;
1078 	}
1079 
1080 	/*
1081 	 * icmp does not support v4 mapped addresses
1082 	 * so we can never be here for a V6 socket
1083 	 * i.e. icmp_family == AF_INET6
1084 	 */
1085 	ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) &&
1086 	    (icmp->icmp_family == AF_INET));
1087 
1088 	ASSERT(icmp->icmp_family == AF_INET);
1089 
1090 	/* Skip past the outer IP and ICMP headers */
1091 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1092 	icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]);
1093 	ipha = (ipha_t *)&icmph[1];
1094 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1095 
1096 	switch (icmph->icmph_type) {
1097 	case ICMP_DEST_UNREACHABLE:
1098 		switch (icmph->icmph_code) {
1099 		case ICMP_FRAGMENTATION_NEEDED:
1100 			/*
1101 			 * IP has already adjusted the path MTU.
1102 			 */
1103 			break;
1104 		case ICMP_PORT_UNREACHABLE:
1105 		case ICMP_PROTOCOL_UNREACHABLE:
1106 			error = ECONNREFUSED;
1107 			break;
1108 		default:
1109 			/* Transient errors */
1110 			break;
1111 		}
1112 		break;
1113 	default:
1114 		/* Transient errors */
1115 		break;
1116 	}
1117 	if (error == 0) {
1118 		freemsg(mp);
1119 		return;
1120 	}
1121 
1122 	/*
1123 	 * Deliver T_UDERROR_IND when the application has asked for it.
1124 	 * The socket layer enables this automatically when connected.
1125 	 */
1126 	if (!icmp->icmp_dgram_errind) {
1127 		freemsg(mp);
1128 		return;
1129 	}
1130 
1131 	sin = sin_null;
1132 	sin.sin_family = AF_INET;
1133 	sin.sin_addr.s_addr = ipha->ipha_dst;
1134 	if (IPCL_IS_NONSTR(connp)) {
1135 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1136 		if (icmp->icmp_state == TS_DATA_XFER) {
1137 			if (sin.sin_addr.s_addr ==
1138 			    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) {
1139 				rw_exit(&icmp->icmp_rwlock);
1140 				(*connp->conn_upcalls->su_set_error)
1141 				    (connp->conn_upper_handle, error);
1142 				goto done;
1143 			}
1144 		} else {
1145 			icmp->icmp_delayed_error = error;
1146 			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1147 		}
1148 		rw_exit(&icmp->icmp_rwlock);
1149 	} else {
1150 
1151 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
1152 		    0, error);
1153 		if (mp1 != NULL)
1154 			putnext(connp->conn_rq, mp1);
1155 	}
1156 done:
1157 	freemsg(mp);
1158 }
1159 
1160 /*
1161  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6
1162  * for IPv6 packets.
1163  * Send permanent (non-transient) errors upstream.
1164  * Assumes that IP has pulled up all the extension headers as well
1165  * as the ICMPv6 header.
1166  */
1167 static void
1168 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
1169 {
1170 	icmp6_t		*icmp6;
1171 	ip6_t		*ip6h, *outer_ip6h;
1172 	uint16_t	iph_hdr_length;
1173 	uint8_t		*nexthdrp;
1174 	sin6_t		sin6;
1175 	mblk_t		*mp1;
1176 	int		error = 0;
1177 	icmp_t		*icmp = connp->conn_icmp;
1178 
1179 	outer_ip6h = (ip6_t *)mp->b_rptr;
1180 	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1181 		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1182 	else
1183 		iph_hdr_length = IPV6_HDR_LEN;
1184 
1185 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1186 	ip6h = (ip6_t *)&icmp6[1];
1187 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1188 		freemsg(mp);
1189 		return;
1190 	}
1191 
1192 	switch (icmp6->icmp6_type) {
1193 	case ICMP6_DST_UNREACH:
1194 		switch (icmp6->icmp6_code) {
1195 		case ICMP6_DST_UNREACH_NOPORT:
1196 			error = ECONNREFUSED;
1197 			break;
1198 		case ICMP6_DST_UNREACH_ADMIN:
1199 		case ICMP6_DST_UNREACH_NOROUTE:
1200 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1201 		case ICMP6_DST_UNREACH_ADDR:
1202 			/* Transient errors */
1203 			break;
1204 		default:
1205 			break;
1206 		}
1207 		break;
1208 	case ICMP6_PACKET_TOO_BIG: {
1209 		struct T_unitdata_ind	*tudi;
1210 		struct T_opthdr		*toh;
1211 		size_t			udi_size;
1212 		mblk_t			*newmp;
1213 		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1214 		    sizeof (struct ip6_mtuinfo);
1215 		sin6_t			*sin6;
1216 		struct ip6_mtuinfo	*mtuinfo;
1217 
1218 		/*
1219 		 * If the application has requested to receive path mtu
1220 		 * information, send up an empty message containing an
1221 		 * IPV6_PATHMTU ancillary data item.
1222 		 */
1223 		if (!icmp->icmp_ipv6_recvpathmtu)
1224 			break;
1225 
1226 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1227 		    opt_length;
1228 		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1229 			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1230 			break;
1231 		}
1232 
1233 		/*
1234 		 * newmp->b_cont is left to NULL on purpose.  This is an
1235 		 * empty message containing only ancillary data.
1236 		 */
1237 		newmp->b_datap->db_type = M_PROTO;
1238 		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1239 		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1240 		tudi->PRIM_type = T_UNITDATA_IND;
1241 		tudi->SRC_length = sizeof (sin6_t);
1242 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1243 		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1244 		tudi->OPT_length = opt_length;
1245 
1246 		sin6 = (sin6_t *)&tudi[1];
1247 		bzero(sin6, sizeof (sin6_t));
1248 		sin6->sin6_family = AF_INET6;
1249 		sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr;
1250 
1251 		toh = (struct T_opthdr *)&sin6[1];
1252 		toh->level = IPPROTO_IPV6;
1253 		toh->name = IPV6_PATHMTU;
1254 		toh->len = opt_length;
1255 		toh->status = 0;
1256 
1257 		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1258 		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1259 		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1260 		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1261 		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1262 		/*
1263 		 * We've consumed everything we need from the original
1264 		 * message.  Free it, then send our empty message.
1265 		 */
1266 		freemsg(mp);
1267 		if (!IPCL_IS_NONSTR(connp)) {
1268 			putnext(connp->conn_rq, newmp);
1269 		} else {
1270 			(*connp->conn_upcalls->su_recv)
1271 			    (connp->conn_upper_handle, newmp, 0, 0, &error,
1272 			    NULL);
1273 			ASSERT(error == 0);
1274 		}
1275 		return;
1276 	}
1277 	case ICMP6_TIME_EXCEEDED:
1278 		/* Transient errors */
1279 		break;
1280 	case ICMP6_PARAM_PROB:
1281 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1282 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1283 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1284 		    (uchar_t *)nexthdrp) {
1285 			error = ECONNREFUSED;
1286 			break;
1287 		}
1288 		break;
1289 	}
1290 	if (error == 0) {
1291 		freemsg(mp);
1292 		return;
1293 	}
1294 
1295 	/*
1296 	 * Deliver T_UDERROR_IND when the application has asked for it.
1297 	 * The socket layer enables this automatically when connected.
1298 	 */
1299 	if (!icmp->icmp_dgram_errind) {
1300 		freemsg(mp);
1301 		return;
1302 	}
1303 
1304 	sin6 = sin6_null;
1305 	sin6.sin6_family = AF_INET6;
1306 	sin6.sin6_addr = ip6h->ip6_dst;
1307 	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1308 
1309 	if (IPCL_IS_NONSTR(connp)) {
1310 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1311 		if (icmp->icmp_state == TS_DATA_XFER) {
1312 			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1313 			    &icmp->icmp_v6dst.sin6_addr)) {
1314 				rw_exit(&icmp->icmp_rwlock);
1315 				(*connp->conn_upcalls->su_set_error)
1316 				    (connp->conn_upper_handle, error);
1317 				goto done;
1318 			}
1319 		} else {
1320 			icmp->icmp_delayed_error = error;
1321 			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1322 		}
1323 		rw_exit(&icmp->icmp_rwlock);
1324 	} else {
1325 
1326 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1327 		    NULL, 0, error);
1328 		if (mp1 != NULL)
1329 			putnext(connp->conn_rq, mp1);
1330 	}
1331 done:
1332 	freemsg(mp);
1333 }
1334 
1335 /*
1336  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1337  * The local address is filled in if endpoint is bound. The remote address
1338  * is filled in if remote address has been precified ("connected endpoint")
1339  * (The concept of connected CLTS sockets is alien to published TPI
1340  *  but we support it anyway).
1341  */
1342 static void
1343 icmp_addr_req(queue_t *q, mblk_t *mp)
1344 {
1345 	icmp_t	*icmp = Q_TO_ICMP(q);
1346 	mblk_t	*ackmp;
1347 	struct T_addr_ack *taa;
1348 
1349 	/* Make it large enough for worst case */
1350 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1351 	    2 * sizeof (sin6_t), 1);
1352 	if (ackmp == NULL) {
1353 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1354 		return;
1355 	}
1356 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1357 
1358 	bzero(taa, sizeof (struct T_addr_ack));
1359 	ackmp->b_wptr = (uchar_t *)&taa[1];
1360 
1361 	taa->PRIM_type = T_ADDR_ACK;
1362 	ackmp->b_datap->db_type = M_PCPROTO;
1363 	rw_enter(&icmp->icmp_rwlock, RW_READER);
1364 	/*
1365 	 * Note: Following code assumes 32 bit alignment of basic
1366 	 * data structures like sin_t and struct T_addr_ack.
1367 	 */
1368 	if (icmp->icmp_state != TS_UNBND) {
1369 		/*
1370 		 * Fill in local address
1371 		 */
1372 		taa->LOCADDR_offset = sizeof (*taa);
1373 		if (icmp->icmp_family == AF_INET) {
1374 			sin_t	*sin;
1375 
1376 			taa->LOCADDR_length = sizeof (sin_t);
1377 			sin = (sin_t *)&taa[1];
1378 			/* Fill zeroes and then intialize non-zero fields */
1379 			*sin = sin_null;
1380 			sin->sin_family = AF_INET;
1381 			if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
1382 			    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1383 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src,
1384 				    sin->sin_addr.s_addr);
1385 			} else {
1386 				/*
1387 				 * INADDR_ANY
1388 				 * icmp_v6src is not set, we might be bound to
1389 				 * broadcast/multicast. Use icmp_bound_v6src as
1390 				 * local address instead (that could
1391 				 * also still be INADDR_ANY)
1392 				 */
1393 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src,
1394 				    sin->sin_addr.s_addr);
1395 			}
1396 			ackmp->b_wptr = (uchar_t *)&sin[1];
1397 		} else {
1398 			sin6_t	*sin6;
1399 
1400 			ASSERT(icmp->icmp_family == AF_INET6);
1401 			taa->LOCADDR_length = sizeof (sin6_t);
1402 			sin6 = (sin6_t *)&taa[1];
1403 			/* Fill zeroes and then intialize non-zero fields */
1404 			*sin6 = sin6_null;
1405 			sin6->sin6_family = AF_INET6;
1406 			if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1407 				sin6->sin6_addr = icmp->icmp_v6src;
1408 			} else {
1409 				/*
1410 				 * UNSPECIFIED
1411 				 * icmp_v6src is not set, we might be bound to
1412 				 * broadcast/multicast. Use icmp_bound_v6src as
1413 				 * local address instead (that could
1414 				 * also still be UNSPECIFIED)
1415 				 */
1416 				sin6->sin6_addr = icmp->icmp_bound_v6src;
1417 			}
1418 			ackmp->b_wptr = (uchar_t *)&sin6[1];
1419 		}
1420 	}
1421 	rw_exit(&icmp->icmp_rwlock);
1422 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1423 	qreply(q, ackmp);
1424 }
1425 
1426 static void
1427 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1428 {
1429 	*tap = icmp_g_t_info_ack;
1430 
1431 	if (icmp->icmp_family == AF_INET6)
1432 		tap->ADDR_size = sizeof (sin6_t);
1433 	else
1434 		tap->ADDR_size = sizeof (sin_t);
1435 	tap->CURRENT_state = icmp->icmp_state;
1436 	tap->OPT_size = icmp_max_optsize;
1437 }
1438 
1439 static void
1440 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1441     t_uscalar_t cap_bits1)
1442 {
1443 	tcap->CAP_bits1 = 0;
1444 
1445 	if (cap_bits1 & TC1_INFO) {
1446 		icmp_copy_info(&tcap->INFO_ack, icmp);
1447 		tcap->CAP_bits1 |= TC1_INFO;
1448 	}
1449 }
1450 
1451 /*
1452  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1453  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1454  * icmp_g_t_info_ack.  The current state of the stream is copied from
1455  * icmp_state.
1456  */
1457 static void
1458 icmp_capability_req(queue_t *q, mblk_t *mp)
1459 {
1460 	icmp_t			*icmp = Q_TO_ICMP(q);
1461 	t_uscalar_t		cap_bits1;
1462 	struct T_capability_ack	*tcap;
1463 
1464 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1465 
1466 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1467 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1468 	if (!mp)
1469 		return;
1470 
1471 	tcap = (struct T_capability_ack *)mp->b_rptr;
1472 
1473 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1474 
1475 	qreply(q, mp);
1476 }
1477 
1478 /*
1479  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1480  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1481  * The current state of the stream is copied from icmp_state.
1482  */
1483 static void
1484 icmp_info_req(queue_t *q, mblk_t *mp)
1485 {
1486 	icmp_t	*icmp = Q_TO_ICMP(q);
1487 
1488 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1489 	    T_INFO_ACK);
1490 	if (!mp)
1491 		return;
1492 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1493 	qreply(q, mp);
1494 }
1495 
1496 /* For /dev/icmp aka AF_INET open */
1497 static int
1498 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1499     int family)
1500 {
1501 	conn_t *connp;
1502 	dev_t	conn_dev;
1503 	icmp_stack_t *is;
1504 	int	error;
1505 
1506 	conn_dev = NULL;
1507 
1508 	/* If the stream is already open, return immediately. */
1509 	if (q->q_ptr != NULL)
1510 		return (0);
1511 
1512 	if (sflag == MODOPEN)
1513 		return (EINVAL);
1514 
1515 	/*
1516 	 * Since ICMP is not used so heavily, allocating from the small
1517 	 * arena should be sufficient.
1518 	 */
1519 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1520 		return (EBUSY);
1521 	}
1522 
1523 	if (flag & SO_FALLBACK) {
1524 		/*
1525 		 * Non streams socket needs a stream to fallback to
1526 		 */
1527 		RD(q)->q_ptr = (void *)conn_dev;
1528 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1529 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1530 		qprocson(q);
1531 		return (0);
1532 	}
1533 
1534 	connp = icmp_open(family, credp, &error, KM_SLEEP);
1535 	if (connp == NULL) {
1536 		ASSERT(error != NULL);
1537 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1538 		return (error);
1539 	}
1540 
1541 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1542 	connp->conn_dev = conn_dev;
1543 	connp->conn_minor_arena = ip_minor_arena_sa;
1544 
1545 	is = connp->conn_icmp->icmp_is;
1546 
1547 	/*
1548 	 * Initialize the icmp_t structure for this stream.
1549 	 */
1550 	q->q_ptr = connp;
1551 	WR(q)->q_ptr = connp;
1552 	connp->conn_rq = q;
1553 	connp->conn_wq = WR(q);
1554 
1555 	if (connp->conn_icmp->icmp_family == AF_INET6) {
1556 		/* Build initial header template for transmit */
1557 		rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
1558 		if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) {
1559 			rw_exit(&connp->conn_icmp->icmp_rwlock);
1560 			inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1561 			ipcl_conn_destroy(connp);
1562 			return (error);
1563 		}
1564 		rw_exit(&connp->conn_icmp->icmp_rwlock);
1565 	}
1566 
1567 
1568 	q->q_hiwat = is->is_recv_hiwat;
1569 	WR(q)->q_hiwat = is->is_xmit_hiwat;
1570 	WR(q)->q_lowat = is->is_xmit_lowat;
1571 
1572 	qprocson(q);
1573 
1574 	/* Set the Stream head write offset. */
1575 	(void) proto_set_tx_wroff(q, connp,
1576 	    connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra);
1577 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat);
1578 
1579 	mutex_enter(&connp->conn_lock);
1580 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1581 	mutex_exit(&connp->conn_lock);
1582 
1583 	return (0);
1584 }
1585 
1586 /* For /dev/icmp4 aka AF_INET open */
1587 static int
1588 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1589 {
1590 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1591 }
1592 
1593 /* For /dev/icmp6 aka AF_INET6 open */
1594 static int
1595 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1596 {
1597 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1598 }
1599 
1600 /*
1601  * This is the open routine for icmp.  It allocates a icmp_t structure for
1602  * the stream and, on the first open of the module, creates an ND table.
1603  */
1604 /* ARGSUSED */
1605 static conn_t *
1606 icmp_open(int family, cred_t *credp, int *err, int flags)
1607 {
1608 	icmp_t	*icmp;
1609 	conn_t *connp;
1610 	zoneid_t zoneid;
1611 	netstack_t *ns;
1612 	icmp_stack_t *is;
1613 	boolean_t isv6 = B_FALSE;
1614 
1615 	*err = secpolicy_net_icmpaccess(credp);
1616 	if (*err != 0)
1617 		return (NULL);
1618 
1619 	if (family == AF_INET6)
1620 		isv6 = B_TRUE;
1621 	ns = netstack_find_by_cred(credp);
1622 	ASSERT(ns != NULL);
1623 	is = ns->netstack_icmp;
1624 	ASSERT(is != NULL);
1625 
1626 	/*
1627 	 * For exclusive stacks we set the zoneid to zero
1628 	 * to make ICMP operate as if in the global zone.
1629 	 */
1630 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1631 		zoneid = GLOBAL_ZONEID;
1632 	else
1633 		zoneid = crgetzoneid(credp);
1634 
1635 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1636 
1637 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1638 	icmp = connp->conn_icmp;
1639 	icmp->icmp_v6dst = sin6_null;
1640 
1641 	/*
1642 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1643 	 * done by netstack_find_by_cred()
1644 	 */
1645 	netstack_rele(ns);
1646 
1647 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1648 	ASSERT(connp->conn_ulp == IPPROTO_ICMP);
1649 	ASSERT(connp->conn_icmp == icmp);
1650 	ASSERT(icmp->icmp_connp == connp);
1651 
1652 	/* Set the initial state of the stream and the privilege status. */
1653 	icmp->icmp_state = TS_UNBND;
1654 	if (isv6) {
1655 		icmp->icmp_ipversion = IPV6_VERSION;
1656 		icmp->icmp_family = AF_INET6;
1657 		connp->conn_ulp = IPPROTO_ICMPV6;
1658 		/* May be changed by a SO_PROTOTYPE socket option. */
1659 		icmp->icmp_proto = IPPROTO_ICMPV6;
1660 		icmp->icmp_checksum_off = 2;	/* Offset for icmp6_cksum */
1661 		icmp->icmp_max_hdr_len = IPV6_HDR_LEN;
1662 		icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit;
1663 		connp->conn_af_isv6 = B_TRUE;
1664 		connp->conn_flags |= IPCL_ISV6;
1665 	} else {
1666 		icmp->icmp_ipversion = IPV4_VERSION;
1667 		icmp->icmp_family = AF_INET;
1668 		/* May be changed by a SO_PROTOTYPE socket option. */
1669 		icmp->icmp_proto = IPPROTO_ICMP;
1670 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH;
1671 		icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl;
1672 		connp->conn_af_isv6 = B_FALSE;
1673 		connp->conn_flags &= ~IPCL_ISV6;
1674 	}
1675 	icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1676 	icmp->icmp_pending_op = -1;
1677 	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1678 	connp->conn_zoneid = zoneid;
1679 
1680 	/*
1681 	 * If the caller has the process-wide flag set, then default to MAC
1682 	 * exempt mode.  This allows read-down to unlabeled hosts.
1683 	 */
1684 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1685 		connp->conn_mac_exempt = B_TRUE;
1686 
1687 	connp->conn_ulp_labeled = is_system_labeled();
1688 
1689 	icmp->icmp_is = is;
1690 
1691 	connp->conn_recv = icmp_input;
1692 	crhold(credp);
1693 	connp->conn_cred = credp;
1694 
1695 	rw_exit(&icmp->icmp_rwlock);
1696 
1697 	connp->conn_flow_cntrld = B_FALSE;
1698 	return (connp);
1699 }
1700 
1701 /*
1702  * Which ICMP options OK to set through T_UNITDATA_REQ...
1703  */
1704 /* ARGSUSED */
1705 static boolean_t
1706 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1707 {
1708 	return (B_TRUE);
1709 }
1710 
1711 /*
1712  * This routine gets default values of certain options whose default
1713  * values are maintained by protcol specific code
1714  */
1715 /* ARGSUSED */
1716 int
1717 icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
1718 {
1719 	icmp_t *icmp = Q_TO_ICMP(q);
1720 	icmp_stack_t *is = icmp->icmp_is;
1721 	int *i1 = (int *)ptr;
1722 
1723 	switch (level) {
1724 	case IPPROTO_IP:
1725 		switch (name) {
1726 		case IP_MULTICAST_TTL:
1727 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1728 			return (sizeof (uchar_t));
1729 		case IP_MULTICAST_LOOP:
1730 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1731 			return (sizeof (uchar_t));
1732 		}
1733 		break;
1734 	case IPPROTO_IPV6:
1735 		switch (name) {
1736 		case IPV6_MULTICAST_HOPS:
1737 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1738 			return (sizeof (int));
1739 		case IPV6_MULTICAST_LOOP:
1740 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1741 			return (sizeof (int));
1742 		case IPV6_UNICAST_HOPS:
1743 			*i1 = is->is_ipv6_hoplimit;
1744 			return (sizeof (int));
1745 		}
1746 		break;
1747 	case IPPROTO_ICMPV6:
1748 		switch (name) {
1749 		case ICMP6_FILTER:
1750 			/* Make it look like "pass all" */
1751 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1752 			return (sizeof (icmp6_filter_t));
1753 		}
1754 		break;
1755 	}
1756 	return (-1);
1757 }
1758 
1759 /*
1760  * This routine retrieves the current status of socket options.
1761  * It returns the size of the option retrieved.
1762  */
1763 int
1764 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1765 {
1766 	icmp_t		*icmp = connp->conn_icmp;
1767 	icmp_stack_t	*is = icmp->icmp_is;
1768 	int		*i1 = (int *)ptr;
1769 	ip6_pkt_t	*ipp = &icmp->icmp_sticky_ipp;
1770 	int		ret = 0;
1771 
1772 	ASSERT(RW_READ_HELD(&icmp->icmp_rwlock));
1773 	switch (level) {
1774 	case SOL_SOCKET:
1775 		switch (name) {
1776 		case SO_DEBUG:
1777 			*i1 = icmp->icmp_debug;
1778 			break;
1779 		case SO_TYPE:
1780 			*i1 = SOCK_RAW;
1781 			break;
1782 		case SO_PROTOTYPE:
1783 			*i1 = icmp->icmp_proto;
1784 			break;
1785 		case SO_REUSEADDR:
1786 			*i1 = icmp->icmp_reuseaddr;
1787 			break;
1788 
1789 		/*
1790 		 * The following three items are available here,
1791 		 * but are only meaningful to IP.
1792 		 */
1793 		case SO_DONTROUTE:
1794 			*i1 = icmp->icmp_dontroute;
1795 			break;
1796 		case SO_USELOOPBACK:
1797 			*i1 = icmp->icmp_useloopback;
1798 			break;
1799 		case SO_BROADCAST:
1800 			*i1 = icmp->icmp_broadcast;
1801 			break;
1802 
1803 		case SO_SNDBUF:
1804 			ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX);
1805 			*i1 = icmp->icmp_xmit_hiwat;
1806 			break;
1807 		case SO_RCVBUF:
1808 			ASSERT(icmp->icmp_recv_hiwat <= INT_MAX);
1809 			*i1 = icmp->icmp_recv_hiwat;
1810 			break;
1811 		case SO_DGRAM_ERRIND:
1812 			*i1 = icmp->icmp_dgram_errind;
1813 			break;
1814 		case SO_TIMESTAMP:
1815 			*i1 = icmp->icmp_timestamp;
1816 			break;
1817 		case SO_MAC_EXEMPT:
1818 			*i1 = connp->conn_mac_exempt;
1819 			break;
1820 		case SO_DOMAIN:
1821 			*i1 = icmp->icmp_family;
1822 			break;
1823 
1824 		/*
1825 		 * Following four not meaningful for icmp
1826 		 * Action is same as "default" to which we fallthrough
1827 		 * so we keep them in comments.
1828 		 * case SO_LINGER:
1829 		 * case SO_KEEPALIVE:
1830 		 * case SO_OOBINLINE:
1831 		 * case SO_ALLZONES:
1832 		 */
1833 		default:
1834 			ret = -1;
1835 			goto done;
1836 		}
1837 		break;
1838 	case IPPROTO_IP:
1839 		/*
1840 		 * Only allow IPv4 option processing on IPv4 sockets.
1841 		 */
1842 		if (icmp->icmp_family != AF_INET) {
1843 			ret = -1;
1844 			goto done;
1845 		}
1846 
1847 		switch (name) {
1848 		case IP_OPTIONS:
1849 		case T_IP_OPTIONS:
1850 			/* Options are passed up with each packet */
1851 			ret = 0;
1852 			goto done;
1853 		case IP_HDRINCL:
1854 			*i1 = (int)icmp->icmp_hdrincl;
1855 			break;
1856 		case IP_TOS:
1857 		case T_IP_TOS:
1858 			*i1 = (int)icmp->icmp_type_of_service;
1859 			break;
1860 		case IP_TTL:
1861 			*i1 = (int)icmp->icmp_ttl;
1862 			break;
1863 		case IP_MULTICAST_IF:
1864 			/* 0 address if not set */
1865 			*(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr;
1866 			ret = sizeof (ipaddr_t);
1867 			goto done;
1868 		case IP_MULTICAST_TTL:
1869 			*(uchar_t *)ptr = icmp->icmp_multicast_ttl;
1870 			ret = sizeof (uchar_t);
1871 			goto done;
1872 		case IP_MULTICAST_LOOP:
1873 			*ptr = connp->conn_multicast_loop;
1874 			ret = sizeof (uint8_t);
1875 			goto done;
1876 		case IP_BOUND_IF:
1877 			/* Zero if not set */
1878 			*i1 = icmp->icmp_bound_if;
1879 			break;	/* goto sizeof (int) option return */
1880 		case IP_UNSPEC_SRC:
1881 			*ptr = icmp->icmp_unspec_source;
1882 			break;	/* goto sizeof (int) option return */
1883 		case IP_RECVIF:
1884 			*ptr = icmp->icmp_recvif;
1885 			break;	/* goto sizeof (int) option return */
1886 		case IP_BROADCAST_TTL:
1887 			*(uchar_t *)ptr = connp->conn_broadcast_ttl;
1888 			return (sizeof (uchar_t));
1889 		case IP_RECVPKTINFO:
1890 			/*
1891 			 * This also handles IP_PKTINFO.
1892 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
1893 			 * Differentiation is based on the size of the argument
1894 			 * passed in.
1895 			 * This option is handled in IP which will return an
1896 			 * error for IP_PKTINFO as it's not supported as a
1897 			 * sticky option.
1898 			 */
1899 			ret = -EINVAL;
1900 			goto done;
1901 		/*
1902 		 * Cannot "get" the value of following options
1903 		 * at this level. Action is same as "default" to
1904 		 * which we fallthrough so we keep them in comments.
1905 		 *
1906 		 * case IP_ADD_MEMBERSHIP:
1907 		 * case IP_DROP_MEMBERSHIP:
1908 		 * case IP_BLOCK_SOURCE:
1909 		 * case IP_UNBLOCK_SOURCE:
1910 		 * case IP_ADD_SOURCE_MEMBERSHIP:
1911 		 * case IP_DROP_SOURCE_MEMBERSHIP:
1912 		 * case MCAST_JOIN_GROUP:
1913 		 * case MCAST_LEAVE_GROUP:
1914 		 * case MCAST_BLOCK_SOURCE:
1915 		 * case MCAST_UNBLOCK_SOURCE:
1916 		 * case MCAST_JOIN_SOURCE_GROUP:
1917 		 * case MCAST_LEAVE_SOURCE_GROUP:
1918 		 * case MRT_INIT:
1919 		 * case MRT_DONE:
1920 		 * case MRT_ADD_VIF:
1921 		 * case MRT_DEL_VIF:
1922 		 * case MRT_ADD_MFC:
1923 		 * case MRT_DEL_MFC:
1924 		 * case MRT_VERSION:
1925 		 * case MRT_ASSERT:
1926 		 * case IP_SEC_OPT:
1927 		 * case IP_NEXTHOP:
1928 		 */
1929 		default:
1930 			ret = -1;
1931 			goto done;
1932 		}
1933 		break;
1934 	case IPPROTO_IPV6:
1935 		/*
1936 		 * Only allow IPv6 option processing on native IPv6 sockets.
1937 		 */
1938 		if (icmp->icmp_family != AF_INET6) {
1939 			ret = -1;
1940 			goto done;
1941 		}
1942 		switch (name) {
1943 		case IPV6_UNICAST_HOPS:
1944 			*i1 = (unsigned int)icmp->icmp_ttl;
1945 			break;
1946 		case IPV6_MULTICAST_IF:
1947 			/* 0 index if not set */
1948 			*i1 = icmp->icmp_multicast_if_index;
1949 			break;
1950 		case IPV6_MULTICAST_HOPS:
1951 			*i1 = icmp->icmp_multicast_ttl;
1952 			break;
1953 		case IPV6_MULTICAST_LOOP:
1954 			*i1 = connp->conn_multicast_loop;
1955 			break;
1956 		case IPV6_BOUND_IF:
1957 			/* Zero if not set */
1958 			*i1 = icmp->icmp_bound_if;
1959 			break;
1960 		case IPV6_UNSPEC_SRC:
1961 			*i1 = icmp->icmp_unspec_source;
1962 			break;
1963 		case IPV6_CHECKSUM:
1964 			/*
1965 			 * Return offset or -1 if no checksum offset.
1966 			 * Does not apply to IPPROTO_ICMPV6
1967 			 */
1968 			if (icmp->icmp_proto == IPPROTO_ICMPV6) {
1969 				ret = -1;
1970 				goto done;
1971 			}
1972 
1973 			if (icmp->icmp_raw_checksum) {
1974 				*i1 = icmp->icmp_checksum_off;
1975 			} else {
1976 				*i1 = -1;
1977 			}
1978 			break;
1979 		case IPV6_JOIN_GROUP:
1980 		case IPV6_LEAVE_GROUP:
1981 		case MCAST_JOIN_GROUP:
1982 		case MCAST_LEAVE_GROUP:
1983 		case MCAST_BLOCK_SOURCE:
1984 		case MCAST_UNBLOCK_SOURCE:
1985 		case MCAST_JOIN_SOURCE_GROUP:
1986 		case MCAST_LEAVE_SOURCE_GROUP:
1987 			/* cannot "get" the value for these */
1988 			ret = -1;
1989 			goto done;
1990 		case IPV6_RECVPKTINFO:
1991 			*i1 = icmp->icmp_ip_recvpktinfo;
1992 			break;
1993 		case IPV6_RECVTCLASS:
1994 			*i1 = icmp->icmp_ipv6_recvtclass;
1995 			break;
1996 		case IPV6_RECVPATHMTU:
1997 			*i1 = icmp->icmp_ipv6_recvpathmtu;
1998 			break;
1999 		case IPV6_V6ONLY:
2000 			*i1 = 1;
2001 			break;
2002 		case IPV6_RECVHOPLIMIT:
2003 			*i1 = icmp->icmp_ipv6_recvhoplimit;
2004 			break;
2005 		case IPV6_RECVHOPOPTS:
2006 			*i1 = icmp->icmp_ipv6_recvhopopts;
2007 			break;
2008 		case IPV6_RECVDSTOPTS:
2009 			*i1 = icmp->icmp_ipv6_recvdstopts;
2010 			break;
2011 		case _OLD_IPV6_RECVDSTOPTS:
2012 			*i1 = icmp->icmp_old_ipv6_recvdstopts;
2013 			break;
2014 		case IPV6_RECVRTHDRDSTOPTS:
2015 			*i1 = icmp->icmp_ipv6_recvrtdstopts;
2016 			break;
2017 		case IPV6_RECVRTHDR:
2018 			*i1 = icmp->icmp_ipv6_recvrthdr;
2019 			break;
2020 		case IPV6_PKTINFO: {
2021 			/* XXX assumes that caller has room for max size! */
2022 			struct in6_pktinfo *pkti;
2023 
2024 			pkti = (struct in6_pktinfo *)ptr;
2025 			if (ipp->ipp_fields & IPPF_IFINDEX)
2026 				pkti->ipi6_ifindex = ipp->ipp_ifindex;
2027 			else
2028 				pkti->ipi6_ifindex = 0;
2029 			if (ipp->ipp_fields & IPPF_ADDR)
2030 				pkti->ipi6_addr = ipp->ipp_addr;
2031 			else
2032 				pkti->ipi6_addr = ipv6_all_zeros;
2033 			ret = sizeof (struct in6_pktinfo);
2034 			goto done;
2035 		}
2036 		case IPV6_NEXTHOP: {
2037 			sin6_t *sin6 = (sin6_t *)ptr;
2038 
2039 			if (!(ipp->ipp_fields & IPPF_NEXTHOP))
2040 				return (0);
2041 			*sin6 = sin6_null;
2042 			sin6->sin6_family = AF_INET6;
2043 			sin6->sin6_addr = ipp->ipp_nexthop;
2044 			ret = (sizeof (sin6_t));
2045 			goto done;
2046 		}
2047 		case IPV6_HOPOPTS:
2048 			if (!(ipp->ipp_fields & IPPF_HOPOPTS))
2049 				return (0);
2050 			if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6)
2051 				return (0);
2052 			bcopy((char *)ipp->ipp_hopopts +
2053 			    icmp->icmp_label_len_v6, ptr,
2054 			    ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2055 			if (icmp->icmp_label_len_v6 > 0) {
2056 				ptr[0] = ((char *)ipp->ipp_hopopts)[0];
2057 				ptr[1] = (ipp->ipp_hopoptslen -
2058 				    icmp->icmp_label_len_v6 + 7) / 8 - 1;
2059 			}
2060 			ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2061 			goto done;
2062 		case IPV6_RTHDRDSTOPTS:
2063 			if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
2064 				return (0);
2065 			bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
2066 			ret = ipp->ipp_rtdstoptslen;
2067 			goto done;
2068 		case IPV6_RTHDR:
2069 			if (!(ipp->ipp_fields & IPPF_RTHDR))
2070 				return (0);
2071 			bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
2072 			ret = ipp->ipp_rthdrlen;
2073 			goto done;
2074 		case IPV6_DSTOPTS:
2075 			if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2076 				ret = 0;
2077 				goto done;
2078 			}
2079 			bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
2080 			ret = ipp->ipp_dstoptslen;
2081 			goto done;
2082 		case IPV6_PATHMTU:
2083 			if (!(ipp->ipp_fields & IPPF_PATHMTU)) {
2084 				ret = 0;
2085 			} else {
2086 				ret = ip_fill_mtuinfo(
2087 				    &icmp->icmp_v6dst.sin6_addr, 0,
2088 				    (struct ip6_mtuinfo *)ptr,
2089 				    is->is_netstack);
2090 			}
2091 			goto done;
2092 		case IPV6_TCLASS:
2093 			if (ipp->ipp_fields & IPPF_TCLASS)
2094 				*i1 = ipp->ipp_tclass;
2095 			else
2096 				*i1 = IPV6_FLOW_TCLASS(
2097 				    IPV6_DEFAULT_VERS_AND_FLOW);
2098 			break;
2099 		default:
2100 			ret = -1;
2101 			goto done;
2102 		}
2103 		break;
2104 	case IPPROTO_ICMPV6:
2105 		/*
2106 		 * Only allow IPv6 option processing on native IPv6 sockets.
2107 		 */
2108 		if (icmp->icmp_family != AF_INET6) {
2109 			ret = -1;
2110 		}
2111 
2112 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
2113 			ret = -1;
2114 		}
2115 
2116 		switch (name) {
2117 		case ICMP6_FILTER:
2118 			if (icmp->icmp_filter == NULL) {
2119 				/* Make it look like "pass all" */
2120 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
2121 			} else {
2122 				(void) bcopy(icmp->icmp_filter, ptr,
2123 				    sizeof (icmp6_filter_t));
2124 			}
2125 			ret = sizeof (icmp6_filter_t);
2126 			goto done;
2127 		default:
2128 			ret = -1;
2129 			goto done;
2130 		}
2131 	default:
2132 		ret = -1;
2133 		goto done;
2134 	}
2135 	ret = sizeof (int);
2136 done:
2137 	return (ret);
2138 }
2139 
2140 /*
2141  * This routine retrieves the current status of socket options.
2142  * It returns the size of the option retrieved.
2143  */
2144 int
2145 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
2146 {
2147 	conn_t  *connp = Q_TO_CONN(q);
2148 	icmp_t	*icmp = connp->conn_icmp;
2149 	int 	err;
2150 
2151 	rw_enter(&icmp->icmp_rwlock, RW_READER);
2152 	err = icmp_opt_get(connp, level, name, ptr);
2153 	rw_exit(&icmp->icmp_rwlock);
2154 	return (err);
2155 }
2156 
2157 int
2158 icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
2159     uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
2160     void *thisdg_attrs, boolean_t checkonly)
2161 {
2162 
2163 	int	*i1 = (int *)invalp;
2164 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
2165 	icmp_t *icmp = connp->conn_icmp;
2166 	icmp_stack_t *is = icmp->icmp_is;
2167 	int	error;
2168 
2169 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
2170 	/*
2171 	 * For fixed length options, no sanity check
2172 	 * of passed in length is done. It is assumed *_optcom_req()
2173 	 * routines do the right thing.
2174 	 */
2175 	switch (level) {
2176 	case SOL_SOCKET:
2177 		switch (name) {
2178 		case SO_DEBUG:
2179 			if (!checkonly)
2180 				icmp->icmp_debug = onoff;
2181 			break;
2182 		case SO_PROTOTYPE:
2183 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2184 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2185 			    secpolicy_net_rawaccess(cr) != 0) {
2186 				*outlenp = 0;
2187 				return (EACCES);
2188 			}
2189 			/* Can't use IPPROTO_RAW with IPv6 */
2190 			if ((*i1 & 0xFF) == IPPROTO_RAW &&
2191 			    icmp->icmp_family == AF_INET6) {
2192 				*outlenp = 0;
2193 				return (EPROTONOSUPPORT);
2194 			}
2195 			if (checkonly) {
2196 				/* T_CHECK case */
2197 				*(int *)outvalp = (*i1 & 0xFF);
2198 				break;
2199 			}
2200 			icmp->icmp_proto = *i1 & 0xFF;
2201 			if ((icmp->icmp_proto == IPPROTO_RAW ||
2202 			    icmp->icmp_proto == IPPROTO_IGMP) &&
2203 			    icmp->icmp_family == AF_INET)
2204 				icmp->icmp_hdrincl = 1;
2205 			else
2206 				icmp->icmp_hdrincl = 0;
2207 
2208 			if (icmp->icmp_family == AF_INET6 &&
2209 			    icmp->icmp_proto == IPPROTO_ICMPV6) {
2210 				/* Set offset for icmp6_cksum */
2211 				icmp->icmp_raw_checksum = 0;
2212 				icmp->icmp_checksum_off = 2;
2213 			}
2214 			if (icmp->icmp_proto == IPPROTO_UDP ||
2215 			    icmp->icmp_proto == IPPROTO_TCP ||
2216 			    icmp->icmp_proto == IPPROTO_SCTP) {
2217 				icmp->icmp_no_tp_cksum = 1;
2218 				icmp->icmp_sticky_ipp.ipp_fields |=
2219 				    IPPF_NO_CKSUM;
2220 			} else {
2221 				icmp->icmp_no_tp_cksum = 0;
2222 				icmp->icmp_sticky_ipp.ipp_fields &=
2223 				    ~IPPF_NO_CKSUM;
2224 			}
2225 
2226 			if (icmp->icmp_filter != NULL &&
2227 			    icmp->icmp_proto != IPPROTO_ICMPV6) {
2228 				kmem_free(icmp->icmp_filter,
2229 				    sizeof (icmp6_filter_t));
2230 				icmp->icmp_filter = NULL;
2231 			}
2232 
2233 			/* Rebuild the header template */
2234 			error = icmp_build_hdrs(icmp);
2235 			if (error != 0) {
2236 				*outlenp = 0;
2237 				return (error);
2238 			}
2239 
2240 			/*
2241 			 * For SCTP, we don't use icmp_bind_proto() for
2242 			 * raw socket binding.  Note that we do not need
2243 			 * to set *outlenp.
2244 			 * FIXME: how does SCTP work?
2245 			 */
2246 			if (icmp->icmp_proto == IPPROTO_SCTP)
2247 				return (0);
2248 
2249 			*outlenp = sizeof (int);
2250 			*(int *)outvalp = *i1 & 0xFF;
2251 
2252 			/* Drop lock across the bind operation */
2253 			rw_exit(&icmp->icmp_rwlock);
2254 			(void) icmp_bind_proto(connp);
2255 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2256 			return (0);
2257 		case SO_REUSEADDR:
2258 			if (!checkonly) {
2259 				icmp->icmp_reuseaddr = onoff;
2260 				PASS_OPT_TO_IP(connp);
2261 			}
2262 			break;
2263 
2264 		/*
2265 		 * The following three items are available here,
2266 		 * but are only meaningful to IP.
2267 		 */
2268 		case SO_DONTROUTE:
2269 			if (!checkonly) {
2270 				icmp->icmp_dontroute = onoff;
2271 				PASS_OPT_TO_IP(connp);
2272 			}
2273 			break;
2274 		case SO_USELOOPBACK:
2275 			if (!checkonly) {
2276 				icmp->icmp_useloopback = onoff;
2277 				PASS_OPT_TO_IP(connp);
2278 			}
2279 			break;
2280 		case SO_BROADCAST:
2281 			if (!checkonly) {
2282 				icmp->icmp_broadcast = onoff;
2283 				PASS_OPT_TO_IP(connp);
2284 			}
2285 			break;
2286 
2287 		case SO_SNDBUF:
2288 			if (*i1 > is->is_max_buf) {
2289 				*outlenp = 0;
2290 				return (ENOBUFS);
2291 			}
2292 			if (!checkonly) {
2293 				if (!IPCL_IS_NONSTR(connp)) {
2294 					connp->conn_wq->q_hiwat = *i1;
2295 				}
2296 				icmp->icmp_xmit_hiwat = *i1;
2297 			}
2298 			break;
2299 		case SO_RCVBUF:
2300 			if (*i1 > is->is_max_buf) {
2301 				*outlenp = 0;
2302 				return (ENOBUFS);
2303 			}
2304 			if (!checkonly) {
2305 				icmp->icmp_recv_hiwat = *i1;
2306 				rw_exit(&icmp->icmp_rwlock);
2307 				(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2308 				    *i1);
2309 				rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2310 			}
2311 			break;
2312 		case SO_DGRAM_ERRIND:
2313 			if (!checkonly)
2314 				icmp->icmp_dgram_errind = onoff;
2315 			break;
2316 		case SO_ALLZONES:
2317 			/*
2318 			 * "soft" error (negative)
2319 			 * option not handled at this level
2320 			 * Note: Do not modify *outlenp
2321 			 */
2322 			return (-EINVAL);
2323 		case SO_TIMESTAMP:
2324 			if (!checkonly) {
2325 				icmp->icmp_timestamp = onoff;
2326 			}
2327 			break;
2328 		case SO_MAC_EXEMPT:
2329 			/*
2330 			 * "soft" error (negative)
2331 			 * option not handled at this level
2332 			 * Note: Do not modify *outlenp
2333 			 */
2334 			return (-EINVAL);
2335 		case SO_RCVTIMEO:
2336 		case SO_SNDTIMEO:
2337 			/*
2338 			 * Pass these two options in order for third part
2339 			 * protocol usage. Here just return directly.
2340 			 */
2341 			return (0);
2342 		/*
2343 		 * Following three not meaningful for icmp
2344 		 * Action is same as "default" so we keep them
2345 		 * in comments.
2346 		 * case SO_LINGER:
2347 		 * case SO_KEEPALIVE:
2348 		 * case SO_OOBINLINE:
2349 		 */
2350 		default:
2351 			*outlenp = 0;
2352 			return (EINVAL);
2353 		}
2354 		break;
2355 	case IPPROTO_IP:
2356 		/*
2357 		 * Only allow IPv4 option processing on IPv4 sockets.
2358 		 */
2359 		if (icmp->icmp_family != AF_INET) {
2360 			*outlenp = 0;
2361 			return (ENOPROTOOPT);
2362 		}
2363 		switch (name) {
2364 		case IP_OPTIONS:
2365 		case T_IP_OPTIONS:
2366 			/* Save options for use by IP. */
2367 			if ((inlen & 0x3) ||
2368 			    inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) {
2369 				*outlenp = 0;
2370 				return (EINVAL);
2371 			}
2372 			if (checkonly)
2373 				break;
2374 
2375 			if (!tsol_option_set(&icmp->icmp_ip_snd_options,
2376 			    &icmp->icmp_ip_snd_options_len,
2377 			    icmp->icmp_label_len, invalp, inlen)) {
2378 				*outlenp = 0;
2379 				return (ENOMEM);
2380 			}
2381 
2382 			icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
2383 			    icmp->icmp_ip_snd_options_len;
2384 			rw_exit(&icmp->icmp_rwlock);
2385 			(void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL:
2386 			    RD(connp->conn_rq), connp,
2387 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
2388 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2389 			break;
2390 		case IP_HDRINCL:
2391 			if (!checkonly)
2392 				icmp->icmp_hdrincl = onoff;
2393 			break;
2394 		case IP_TOS:
2395 		case T_IP_TOS:
2396 			if (!checkonly) {
2397 				icmp->icmp_type_of_service = (uint8_t)*i1;
2398 			}
2399 			break;
2400 		case IP_TTL:
2401 			if (!checkonly) {
2402 				icmp->icmp_ttl = (uint8_t)*i1;
2403 			}
2404 			break;
2405 		case IP_MULTICAST_IF:
2406 			/*
2407 			 * TODO should check OPTMGMT reply and undo this if
2408 			 * there is an error.
2409 			 */
2410 			if (!checkonly) {
2411 				icmp->icmp_multicast_if_addr = *i1;
2412 				PASS_OPT_TO_IP(connp);
2413 			}
2414 			break;
2415 		case IP_MULTICAST_TTL:
2416 			if (!checkonly)
2417 				icmp->icmp_multicast_ttl = *invalp;
2418 			break;
2419 		case IP_MULTICAST_LOOP:
2420 			if (!checkonly) {
2421 				connp->conn_multicast_loop =
2422 				    (*invalp == 0) ? 0 : 1;
2423 				PASS_OPT_TO_IP(connp);
2424 			}
2425 			break;
2426 		case IP_BOUND_IF:
2427 			if (!checkonly) {
2428 				icmp->icmp_bound_if = *i1;
2429 				PASS_OPT_TO_IP(connp);
2430 			}
2431 			break;
2432 		case IP_UNSPEC_SRC:
2433 			if (!checkonly) {
2434 				icmp->icmp_unspec_source = onoff;
2435 				PASS_OPT_TO_IP(connp);
2436 			}
2437 			break;
2438 		case IP_BROADCAST_TTL:
2439 			if (!checkonly)
2440 				connp->conn_broadcast_ttl = *invalp;
2441 			break;
2442 		case IP_RECVIF:
2443 			if (!checkonly) {
2444 				icmp->icmp_recvif = onoff;
2445 			}
2446 			/*
2447 			 * pass to ip
2448 			 */
2449 			return (-EINVAL);
2450 		case IP_PKTINFO: {
2451 			/*
2452 			 * This also handles IP_RECVPKTINFO.
2453 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
2454 			 * Differentiation is based on the size of the argument
2455 			 * passed in.
2456 			 */
2457 			struct in_pktinfo *pktinfop;
2458 			ip4_pkt_t *attr_pktinfop;
2459 
2460 			if (checkonly)
2461 				break;
2462 
2463 			if (inlen == sizeof (int)) {
2464 				/*
2465 				 * This is IP_RECVPKTINFO option.
2466 				 * Keep a local copy of wether this option is
2467 				 * set or not and pass it down to IP for
2468 				 * processing.
2469 				 */
2470 				icmp->icmp_ip_recvpktinfo = onoff;
2471 				return (-EINVAL);
2472 			}
2473 
2474 
2475 			if (inlen != sizeof (struct in_pktinfo)) {
2476 				return (EINVAL);
2477 			}
2478 
2479 			if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs)
2480 			    == NULL) {
2481 				/*
2482 				 * sticky option is not supported
2483 				 */
2484 				return (EINVAL);
2485 			}
2486 
2487 			pktinfop = (struct in_pktinfo *)invalp;
2488 
2489 			/*
2490 			 * Atleast one of the values should be specified
2491 			 */
2492 			if (pktinfop->ipi_ifindex == 0 &&
2493 			    pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) {
2494 				return (EINVAL);
2495 			}
2496 
2497 			attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr;
2498 			attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex;
2499 		}
2500 			break;
2501 		case IP_ADD_MEMBERSHIP:
2502 		case IP_DROP_MEMBERSHIP:
2503 		case IP_BLOCK_SOURCE:
2504 		case IP_UNBLOCK_SOURCE:
2505 		case IP_ADD_SOURCE_MEMBERSHIP:
2506 		case IP_DROP_SOURCE_MEMBERSHIP:
2507 		case MCAST_JOIN_GROUP:
2508 		case MCAST_LEAVE_GROUP:
2509 		case MCAST_BLOCK_SOURCE:
2510 		case MCAST_UNBLOCK_SOURCE:
2511 		case MCAST_JOIN_SOURCE_GROUP:
2512 		case MCAST_LEAVE_SOURCE_GROUP:
2513 		case MRT_INIT:
2514 		case MRT_DONE:
2515 		case MRT_ADD_VIF:
2516 		case MRT_DEL_VIF:
2517 		case MRT_ADD_MFC:
2518 		case MRT_DEL_MFC:
2519 		case MRT_VERSION:
2520 		case MRT_ASSERT:
2521 		case IP_SEC_OPT:
2522 		case IP_NEXTHOP:
2523 			/*
2524 			 * "soft" error (negative)
2525 			 * option not handled at this level
2526 			 * Note: Do not modify *outlenp
2527 			 */
2528 			return (-EINVAL);
2529 		default:
2530 			*outlenp = 0;
2531 			return (EINVAL);
2532 		}
2533 		break;
2534 	case IPPROTO_IPV6: {
2535 		ip6_pkt_t		*ipp;
2536 		boolean_t		sticky;
2537 
2538 		if (icmp->icmp_family != AF_INET6) {
2539 			*outlenp = 0;
2540 			return (ENOPROTOOPT);
2541 		}
2542 		/*
2543 		 * Deal with both sticky options and ancillary data
2544 		 */
2545 		if (thisdg_attrs == NULL) {
2546 			/* sticky options, or none */
2547 			ipp = &icmp->icmp_sticky_ipp;
2548 			sticky = B_TRUE;
2549 		} else {
2550 			/* ancillary data */
2551 			ipp = (ip6_pkt_t *)thisdg_attrs;
2552 			sticky = B_FALSE;
2553 		}
2554 
2555 		switch (name) {
2556 		case IPV6_MULTICAST_IF:
2557 			if (!checkonly) {
2558 				icmp->icmp_multicast_if_index = *i1;
2559 				PASS_OPT_TO_IP(connp);
2560 			}
2561 			break;
2562 		case IPV6_UNICAST_HOPS:
2563 			/* -1 means use default */
2564 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2565 				*outlenp = 0;
2566 				return (EINVAL);
2567 			}
2568 			if (!checkonly) {
2569 				if (*i1 == -1) {
2570 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2571 					    is->is_ipv6_hoplimit;
2572 					ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
2573 					/* Pass modified value to IP. */
2574 					*i1 = ipp->ipp_hoplimit;
2575 				} else {
2576 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2577 					    (uint8_t)*i1;
2578 					ipp->ipp_fields |= IPPF_UNICAST_HOPS;
2579 				}
2580 				/* Rebuild the header template */
2581 				error = icmp_build_hdrs(icmp);
2582 				if (error != 0) {
2583 					*outlenp = 0;
2584 					return (error);
2585 				}
2586 			}
2587 			break;
2588 		case IPV6_MULTICAST_HOPS:
2589 			/* -1 means use default */
2590 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2591 				*outlenp = 0;
2592 				return (EINVAL);
2593 			}
2594 			if (!checkonly) {
2595 				if (*i1 == -1) {
2596 					icmp->icmp_multicast_ttl =
2597 					    ipp->ipp_multicast_hops =
2598 					    IP_DEFAULT_MULTICAST_TTL;
2599 					ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS;
2600 					/* Pass modified value to IP. */
2601 					*i1 = icmp->icmp_multicast_ttl;
2602 				} else {
2603 					icmp->icmp_multicast_ttl =
2604 					    ipp->ipp_multicast_hops =
2605 					    (uint8_t)*i1;
2606 					ipp->ipp_fields |= IPPF_MULTICAST_HOPS;
2607 				}
2608 			}
2609 			break;
2610 		case IPV6_MULTICAST_LOOP:
2611 			if (*i1 != 0 && *i1 != 1) {
2612 				*outlenp = 0;
2613 				return (EINVAL);
2614 			}
2615 			if (!checkonly) {
2616 				connp->conn_multicast_loop = *i1;
2617 				PASS_OPT_TO_IP(connp);
2618 			}
2619 			break;
2620 		case IPV6_CHECKSUM:
2621 			/*
2622 			 * Integer offset into the user data of where the
2623 			 * checksum is located.
2624 			 * Offset of -1 disables option.
2625 			 * Does not apply to IPPROTO_ICMPV6.
2626 			 */
2627 			if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) {
2628 				*outlenp = 0;
2629 				return (EINVAL);
2630 			}
2631 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2632 				/* Negative or not 16 bit aligned offset */
2633 				*outlenp = 0;
2634 				return (EINVAL);
2635 			}
2636 			if (checkonly)
2637 				break;
2638 
2639 			if (*i1 == -1) {
2640 				icmp->icmp_raw_checksum = 0;
2641 				ipp->ipp_fields &= ~IPPF_RAW_CKSUM;
2642 			} else {
2643 				icmp->icmp_raw_checksum = 1;
2644 				icmp->icmp_checksum_off = *i1;
2645 				ipp->ipp_fields |= IPPF_RAW_CKSUM;
2646 			}
2647 			/* Rebuild the header template */
2648 			error = icmp_build_hdrs(icmp);
2649 			if (error != 0) {
2650 				*outlenp = 0;
2651 				return (error);
2652 			}
2653 			break;
2654 		case IPV6_JOIN_GROUP:
2655 		case IPV6_LEAVE_GROUP:
2656 		case MCAST_JOIN_GROUP:
2657 		case MCAST_LEAVE_GROUP:
2658 		case MCAST_BLOCK_SOURCE:
2659 		case MCAST_UNBLOCK_SOURCE:
2660 		case MCAST_JOIN_SOURCE_GROUP:
2661 		case MCAST_LEAVE_SOURCE_GROUP:
2662 			/*
2663 			 * "soft" error (negative)
2664 			 * option not handled at this level
2665 			 * Note: Do not modify *outlenp
2666 			 */
2667 			return (-EINVAL);
2668 		case IPV6_BOUND_IF:
2669 			if (!checkonly) {
2670 				icmp->icmp_bound_if = *i1;
2671 				PASS_OPT_TO_IP(connp);
2672 			}
2673 			break;
2674 		case IPV6_UNSPEC_SRC:
2675 			if (!checkonly) {
2676 				icmp->icmp_unspec_source = onoff;
2677 				PASS_OPT_TO_IP(connp);
2678 			}
2679 			break;
2680 		case IPV6_RECVTCLASS:
2681 			if (!checkonly) {
2682 				icmp->icmp_ipv6_recvtclass = onoff;
2683 				PASS_OPT_TO_IP(connp);
2684 			}
2685 			break;
2686 		/*
2687 		 * Set boolean switches for ancillary data delivery
2688 		 */
2689 		case IPV6_RECVPKTINFO:
2690 			if (!checkonly) {
2691 				icmp->icmp_ip_recvpktinfo = onoff;
2692 				PASS_OPT_TO_IP(connp);
2693 			}
2694 			break;
2695 		case IPV6_RECVPATHMTU:
2696 			if (!checkonly) {
2697 				icmp->icmp_ipv6_recvpathmtu = onoff;
2698 				PASS_OPT_TO_IP(connp);
2699 			}
2700 			break;
2701 		case IPV6_RECVHOPLIMIT:
2702 			if (!checkonly) {
2703 				icmp->icmp_ipv6_recvhoplimit = onoff;
2704 				PASS_OPT_TO_IP(connp);
2705 			}
2706 			break;
2707 		case IPV6_RECVHOPOPTS:
2708 			if (!checkonly) {
2709 				icmp->icmp_ipv6_recvhopopts = onoff;
2710 				PASS_OPT_TO_IP(connp);
2711 			}
2712 			break;
2713 		case IPV6_RECVDSTOPTS:
2714 			if (!checkonly) {
2715 				icmp->icmp_ipv6_recvdstopts = onoff;
2716 				PASS_OPT_TO_IP(connp);
2717 			}
2718 			break;
2719 		case _OLD_IPV6_RECVDSTOPTS:
2720 			if (!checkonly)
2721 				icmp->icmp_old_ipv6_recvdstopts = onoff;
2722 			break;
2723 		case IPV6_RECVRTHDRDSTOPTS:
2724 			if (!checkonly) {
2725 				icmp->icmp_ipv6_recvrtdstopts = onoff;
2726 				PASS_OPT_TO_IP(connp);
2727 			}
2728 			break;
2729 		case IPV6_RECVRTHDR:
2730 			if (!checkonly) {
2731 				icmp->icmp_ipv6_recvrthdr = onoff;
2732 				PASS_OPT_TO_IP(connp);
2733 			}
2734 			break;
2735 		/*
2736 		 * Set sticky options or ancillary data.
2737 		 * If sticky options, (re)build any extension headers
2738 		 * that might be needed as a result.
2739 		 */
2740 		case IPV6_PKTINFO:
2741 			/*
2742 			 * The source address and ifindex are verified
2743 			 * in ip_opt_set(). For ancillary data the
2744 			 * source address is checked in ip_wput_v6.
2745 			 */
2746 			if (inlen != 0 && inlen !=
2747 			    sizeof (struct in6_pktinfo)) {
2748 				return (EINVAL);
2749 			}
2750 			if (checkonly)
2751 				break;
2752 
2753 			if (inlen == 0) {
2754 				ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
2755 				ipp->ipp_sticky_ignored |=
2756 				    (IPPF_IFINDEX|IPPF_ADDR);
2757 			} else {
2758 				struct in6_pktinfo *pkti;
2759 
2760 				pkti = (struct in6_pktinfo *)invalp;
2761 				ipp->ipp_ifindex = pkti->ipi6_ifindex;
2762 				ipp->ipp_addr = pkti->ipi6_addr;
2763 				if (ipp->ipp_ifindex != 0)
2764 					ipp->ipp_fields |= IPPF_IFINDEX;
2765 				else
2766 					ipp->ipp_fields &= ~IPPF_IFINDEX;
2767 				if (!IN6_IS_ADDR_UNSPECIFIED(
2768 				    &ipp->ipp_addr))
2769 					ipp->ipp_fields |= IPPF_ADDR;
2770 				else
2771 					ipp->ipp_fields &= ~IPPF_ADDR;
2772 			}
2773 			if (sticky) {
2774 				error = icmp_build_hdrs(icmp);
2775 				if (error != 0)
2776 					return (error);
2777 				PASS_OPT_TO_IP(connp);
2778 			}
2779 			break;
2780 		case IPV6_HOPLIMIT:
2781 			/* This option can only be used as ancillary data. */
2782 			if (sticky)
2783 				return (EINVAL);
2784 			if (inlen != 0 && inlen != sizeof (int))
2785 				return (EINVAL);
2786 			if (checkonly)
2787 				break;
2788 
2789 			if (inlen == 0) {
2790 				ipp->ipp_fields &= ~IPPF_HOPLIMIT;
2791 				ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT;
2792 			} else {
2793 				if (*i1 > 255 || *i1 < -1)
2794 					return (EINVAL);
2795 				if (*i1 == -1)
2796 					ipp->ipp_hoplimit =
2797 					    is->is_ipv6_hoplimit;
2798 				else
2799 					ipp->ipp_hoplimit = *i1;
2800 				ipp->ipp_fields |= IPPF_HOPLIMIT;
2801 			}
2802 			break;
2803 		case IPV6_TCLASS:
2804 			/*
2805 			 * IPV6_RECVTCLASS accepts -1 as use kernel default
2806 			 * and [0, 255] as the actualy traffic class.
2807 			 */
2808 			if (inlen != 0 && inlen != sizeof (int)) {
2809 				return (EINVAL);
2810 			}
2811 			if (checkonly)
2812 				break;
2813 
2814 			if (inlen == 0) {
2815 				ipp->ipp_fields &= ~IPPF_TCLASS;
2816 				ipp->ipp_sticky_ignored |= IPPF_TCLASS;
2817 			} else {
2818 				if (*i1 >= 256 || *i1 < -1)
2819 					return (EINVAL);
2820 				if (*i1 == -1) {
2821 					ipp->ipp_tclass =
2822 					    IPV6_FLOW_TCLASS(
2823 					    IPV6_DEFAULT_VERS_AND_FLOW);
2824 				} else {
2825 					ipp->ipp_tclass = *i1;
2826 				}
2827 				ipp->ipp_fields |= IPPF_TCLASS;
2828 			}
2829 			if (sticky) {
2830 				error = icmp_build_hdrs(icmp);
2831 				if (error != 0)
2832 					return (error);
2833 			}
2834 			break;
2835 		case IPV6_NEXTHOP:
2836 			/*
2837 			 * IP will verify that the nexthop is reachable
2838 			 * and fail for sticky options.
2839 			 */
2840 			if (inlen != 0 && inlen != sizeof (sin6_t)) {
2841 				return (EINVAL);
2842 			}
2843 			if (checkonly)
2844 				break;
2845 
2846 			if (inlen == 0) {
2847 				ipp->ipp_fields &= ~IPPF_NEXTHOP;
2848 				ipp->ipp_sticky_ignored |= IPPF_NEXTHOP;
2849 			} else {
2850 				sin6_t *sin6 = (sin6_t *)invalp;
2851 
2852 				if (sin6->sin6_family != AF_INET6) {
2853 					return (EAFNOSUPPORT);
2854 				}
2855 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
2856 					return (EADDRNOTAVAIL);
2857 				}
2858 				ipp->ipp_nexthop = sin6->sin6_addr;
2859 				if (!IN6_IS_ADDR_UNSPECIFIED(
2860 				    &ipp->ipp_nexthop))
2861 					ipp->ipp_fields |= IPPF_NEXTHOP;
2862 				else
2863 					ipp->ipp_fields &= ~IPPF_NEXTHOP;
2864 			}
2865 			if (sticky) {
2866 				error = icmp_build_hdrs(icmp);
2867 				if (error != 0)
2868 					return (error);
2869 				PASS_OPT_TO_IP(connp);
2870 			}
2871 			break;
2872 		case IPV6_HOPOPTS: {
2873 			ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
2874 			/*
2875 			 * Sanity checks - minimum size, size a multiple of
2876 			 * eight bytes, and matching size passed in.
2877 			 */
2878 			if (inlen != 0 &&
2879 			    inlen != (8 * (hopts->ip6h_len + 1))) {
2880 				return (EINVAL);
2881 			}
2882 
2883 			if (checkonly)
2884 				break;
2885 			error = optcom_pkt_set(invalp, inlen, sticky,
2886 			    (uchar_t **)&ipp->ipp_hopopts,
2887 			    &ipp->ipp_hopoptslen,
2888 			    sticky ? icmp->icmp_label_len_v6 : 0);
2889 			if (error != 0)
2890 				return (error);
2891 			if (ipp->ipp_hopoptslen == 0) {
2892 				ipp->ipp_fields &= ~IPPF_HOPOPTS;
2893 				ipp->ipp_sticky_ignored |= IPPF_HOPOPTS;
2894 			} else {
2895 				ipp->ipp_fields |= IPPF_HOPOPTS;
2896 			}
2897 			if (sticky) {
2898 				error = icmp_build_hdrs(icmp);
2899 				if (error != 0)
2900 					return (error);
2901 			}
2902 			break;
2903 		}
2904 		case IPV6_RTHDRDSTOPTS: {
2905 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2906 
2907 			/*
2908 			 * Sanity checks - minimum size, size a multiple of
2909 			 * eight bytes, and matching size passed in.
2910 			 */
2911 			if (inlen != 0 &&
2912 			    inlen != (8 * (dopts->ip6d_len + 1)))
2913 				return (EINVAL);
2914 
2915 			if (checkonly)
2916 				break;
2917 
2918 			if (inlen == 0) {
2919 				if (sticky &&
2920 				    (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) {
2921 					kmem_free(ipp->ipp_rtdstopts,
2922 					    ipp->ipp_rtdstoptslen);
2923 					ipp->ipp_rtdstopts = NULL;
2924 					ipp->ipp_rtdstoptslen = 0;
2925 				}
2926 				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
2927 				ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
2928 			} else {
2929 				error = optcom_pkt_set(invalp, inlen, sticky,
2930 				    (uchar_t **)&ipp->ipp_rtdstopts,
2931 				    &ipp->ipp_rtdstoptslen, 0);
2932 				if (error != 0)
2933 					return (error);
2934 				ipp->ipp_fields |= IPPF_RTDSTOPTS;
2935 			}
2936 			if (sticky) {
2937 				error = icmp_build_hdrs(icmp);
2938 				if (error != 0)
2939 					return (error);
2940 			}
2941 			break;
2942 		}
2943 		case IPV6_DSTOPTS: {
2944 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2945 
2946 			/*
2947 			 * Sanity checks - minimum size, size a multiple of
2948 			 * eight bytes, and matching size passed in.
2949 			 */
2950 			if (inlen != 0 &&
2951 			    inlen != (8 * (dopts->ip6d_len + 1)))
2952 				return (EINVAL);
2953 
2954 			if (checkonly)
2955 				break;
2956 
2957 			if (inlen == 0) {
2958 				if (sticky &&
2959 				    (ipp->ipp_fields & IPPF_DSTOPTS) != 0) {
2960 					kmem_free(ipp->ipp_dstopts,
2961 					    ipp->ipp_dstoptslen);
2962 					ipp->ipp_dstopts = NULL;
2963 					ipp->ipp_dstoptslen = 0;
2964 				}
2965 				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2966 				ipp->ipp_sticky_ignored |= IPPF_DSTOPTS;
2967 			} else {
2968 				error = optcom_pkt_set(invalp, inlen, sticky,
2969 				    (uchar_t **)&ipp->ipp_dstopts,
2970 				    &ipp->ipp_dstoptslen, 0);
2971 				if (error != 0)
2972 					return (error);
2973 				ipp->ipp_fields |= IPPF_DSTOPTS;
2974 			}
2975 			if (sticky) {
2976 				error = icmp_build_hdrs(icmp);
2977 				if (error != 0)
2978 					return (error);
2979 			}
2980 			break;
2981 		}
2982 		case IPV6_RTHDR: {
2983 			ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
2984 
2985 			/*
2986 			 * Sanity checks - minimum size, size a multiple of
2987 			 * eight bytes, and matching size passed in.
2988 			 */
2989 			if (inlen != 0 &&
2990 			    inlen != (8 * (rt->ip6r_len + 1)))
2991 				return (EINVAL);
2992 
2993 			if (checkonly)
2994 				break;
2995 
2996 			if (inlen == 0) {
2997 				if (sticky &&
2998 				    (ipp->ipp_fields & IPPF_RTHDR) != 0) {
2999 					kmem_free(ipp->ipp_rthdr,
3000 					    ipp->ipp_rthdrlen);
3001 					ipp->ipp_rthdr = NULL;
3002 					ipp->ipp_rthdrlen = 0;
3003 				}
3004 				ipp->ipp_fields &= ~IPPF_RTHDR;
3005 				ipp->ipp_sticky_ignored |= IPPF_RTHDR;
3006 			} else {
3007 				error = optcom_pkt_set(invalp, inlen, sticky,
3008 				    (uchar_t **)&ipp->ipp_rthdr,
3009 				    &ipp->ipp_rthdrlen, 0);
3010 				if (error != 0)
3011 					return (error);
3012 				ipp->ipp_fields |= IPPF_RTHDR;
3013 			}
3014 			if (sticky) {
3015 				error = icmp_build_hdrs(icmp);
3016 				if (error != 0)
3017 					return (error);
3018 			}
3019 			break;
3020 		}
3021 
3022 		case IPV6_DONTFRAG:
3023 			if (checkonly)
3024 				break;
3025 
3026 			if (onoff) {
3027 				ipp->ipp_fields |= IPPF_DONTFRAG;
3028 			} else {
3029 				ipp->ipp_fields &= ~IPPF_DONTFRAG;
3030 			}
3031 			break;
3032 
3033 		case IPV6_USE_MIN_MTU:
3034 			if (inlen != sizeof (int))
3035 				return (EINVAL);
3036 
3037 			if (*i1 < -1 || *i1 > 1)
3038 				return (EINVAL);
3039 
3040 			if (checkonly)
3041 				break;
3042 
3043 			ipp->ipp_fields |= IPPF_USE_MIN_MTU;
3044 			ipp->ipp_use_min_mtu = *i1;
3045 			break;
3046 
3047 		/*
3048 		 * This option can't be set.  Its only returned via
3049 		 * getsockopt() or ancillary data.
3050 		 */
3051 		case IPV6_PATHMTU:
3052 			return (EINVAL);
3053 
3054 		case IPV6_SEC_OPT:
3055 		case IPV6_SRC_PREFERENCES:
3056 		case IPV6_V6ONLY:
3057 			/* Handled at IP level */
3058 			return (-EINVAL);
3059 		default:
3060 			*outlenp = 0;
3061 			return (EINVAL);
3062 		}
3063 		break;
3064 	}		/* end IPPROTO_IPV6 */
3065 
3066 	case IPPROTO_ICMPV6:
3067 		/*
3068 		 * Only allow IPv6 option processing on IPv6 sockets.
3069 		 */
3070 		if (icmp->icmp_family != AF_INET6) {
3071 			*outlenp = 0;
3072 			return (ENOPROTOOPT);
3073 		}
3074 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
3075 			*outlenp = 0;
3076 			return (ENOPROTOOPT);
3077 		}
3078 		switch (name) {
3079 		case ICMP6_FILTER:
3080 			if (!checkonly) {
3081 				if ((inlen != 0) &&
3082 				    (inlen != sizeof (icmp6_filter_t)))
3083 					return (EINVAL);
3084 
3085 				if (inlen == 0) {
3086 					if (icmp->icmp_filter != NULL) {
3087 						kmem_free(icmp->icmp_filter,
3088 						    sizeof (icmp6_filter_t));
3089 						icmp->icmp_filter = NULL;
3090 					}
3091 				} else {
3092 					if (icmp->icmp_filter == NULL) {
3093 						icmp->icmp_filter = kmem_alloc(
3094 						    sizeof (icmp6_filter_t),
3095 						    KM_NOSLEEP);
3096 						if (icmp->icmp_filter == NULL) {
3097 							*outlenp = 0;
3098 							return (ENOBUFS);
3099 						}
3100 					}
3101 					(void) bcopy(invalp, icmp->icmp_filter,
3102 					    inlen);
3103 				}
3104 			}
3105 			break;
3106 
3107 		default:
3108 			*outlenp = 0;
3109 			return (EINVAL);
3110 		}
3111 		break;
3112 	default:
3113 		*outlenp = 0;
3114 		return (EINVAL);
3115 	}
3116 	/*
3117 	 * Common case of OK return with outval same as inval.
3118 	 */
3119 	if (invalp != outvalp) {
3120 		/* don't trust bcopy for identical src/dst */
3121 		(void) bcopy(invalp, outvalp, inlen);
3122 	}
3123 	*outlenp = inlen;
3124 	return (0);
3125 }
3126 
3127 /* This routine sets socket options. */
3128 /* ARGSUSED */
3129 int
3130 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
3131     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3132     void *thisdg_attrs, cred_t *cr)
3133 {
3134 	boolean_t checkonly;
3135 	int	error;
3136 
3137 	error = 0;
3138 	switch (optset_context) {
3139 	case SETFN_OPTCOM_CHECKONLY:
3140 		checkonly = B_TRUE;
3141 		/*
3142 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
3143 		 * inlen != 0 implies value supplied and
3144 		 * 	we have to "pretend" to set it.
3145 		 * inlen == 0 implies that there is no
3146 		 * 	value part in T_CHECK request and just validation
3147 		 * done elsewhere should be enough, we just return here.
3148 		 */
3149 		if (inlen == 0) {
3150 			*outlenp = 0;
3151 			error = 0;
3152 			goto done;
3153 		}
3154 		break;
3155 	case SETFN_OPTCOM_NEGOTIATE:
3156 		checkonly = B_FALSE;
3157 		break;
3158 	case SETFN_UD_NEGOTIATE:
3159 	case SETFN_CONN_NEGOTIATE:
3160 		checkonly = B_FALSE;
3161 		/*
3162 		 * Negotiating local and "association-related" options
3163 		 * through T_UNITDATA_REQ.
3164 		 *
3165 		 * Following routine can filter out ones we do not
3166 		 * want to be "set" this way.
3167 		 */
3168 		if (!icmp_opt_allow_udr_set(level, name)) {
3169 			*outlenp = 0;
3170 			error = EINVAL;
3171 			goto done;
3172 		}
3173 		break;
3174 	default:
3175 		/*
3176 		 * We should never get here
3177 		 */
3178 		*outlenp = 0;
3179 		error = EINVAL;
3180 		goto done;
3181 	}
3182 
3183 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
3184 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
3185 	error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
3186 	    outvalp, cr, thisdg_attrs, checkonly);
3187 
3188 done:
3189 	return (error);
3190 }
3191 
3192 /* This routine sets socket options. */
3193 /* ARGSUSED */
3194 int
3195 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
3196     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3197     void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
3198 {
3199 	conn_t	*connp =  Q_TO_CONN(q);
3200 	icmp_t	*icmp;
3201 	int error;
3202 
3203 	icmp = connp->conn_icmp;
3204 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3205 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
3206 	    outlenp, outvalp, thisdg_attrs, cr);
3207 	rw_exit(&icmp->icmp_rwlock);
3208 	return (error);
3209 }
3210 
3211 /*
3212  * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl,
3213  * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum.
3214  * The headers include ip6i_t (if needed), ip6_t, and any sticky extension
3215  * headers.
3216  * Returns failure if can't allocate memory.
3217  */
3218 static int
3219 icmp_build_hdrs(icmp_t *icmp)
3220 {
3221 	icmp_stack_t *is = icmp->icmp_is;
3222 	uchar_t	*hdrs;
3223 	uint_t	hdrs_len;
3224 	ip6_t	*ip6h;
3225 	ip6i_t	*ip6i;
3226 	ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
3227 
3228 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
3229 	hdrs_len = ip_total_hdrs_len_v6(ipp);
3230 	ASSERT(hdrs_len != 0);
3231 	if (hdrs_len != icmp->icmp_sticky_hdrs_len) {
3232 		/* Need to reallocate */
3233 		if (hdrs_len != 0) {
3234 			hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
3235 			if (hdrs == NULL)
3236 				return (ENOMEM);
3237 		} else {
3238 			hdrs = NULL;
3239 		}
3240 		if (icmp->icmp_sticky_hdrs_len != 0) {
3241 			kmem_free(icmp->icmp_sticky_hdrs,
3242 			    icmp->icmp_sticky_hdrs_len);
3243 		}
3244 		icmp->icmp_sticky_hdrs = hdrs;
3245 		icmp->icmp_sticky_hdrs_len = hdrs_len;
3246 	}
3247 	ip_build_hdrs_v6(icmp->icmp_sticky_hdrs,
3248 	    icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto);
3249 
3250 	/* Set header fields not in ipp */
3251 	if (ipp->ipp_fields & IPPF_HAS_IP6I) {
3252 		ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs;
3253 		ip6h = (ip6_t *)&ip6i[1];
3254 
3255 		if (ipp->ipp_fields & IPPF_RAW_CKSUM) {
3256 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
3257 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
3258 		}
3259 		if (ipp->ipp_fields & IPPF_NO_CKSUM) {
3260 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
3261 		}
3262 	} else {
3263 		ip6h = (ip6_t *)icmp->icmp_sticky_hdrs;
3264 	}
3265 
3266 	if (!(ipp->ipp_fields & IPPF_ADDR))
3267 		ip6h->ip6_src = icmp->icmp_v6src;
3268 
3269 	/* Try to get everything in a single mblk */
3270 	if (hdrs_len > icmp->icmp_max_hdr_len) {
3271 		icmp->icmp_max_hdr_len = hdrs_len;
3272 		rw_exit(&icmp->icmp_rwlock);
3273 		(void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq,
3274 		    icmp->icmp_connp,
3275 		    icmp->icmp_max_hdr_len + is->is_wroff_extra);
3276 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3277 	}
3278 	return (0);
3279 }
3280 
3281 /*
3282  * This routine retrieves the value of an ND variable in a icmpparam_t
3283  * structure.  It is called through nd_getset when a user reads the
3284  * variable.
3285  */
3286 /* ARGSUSED */
3287 static int
3288 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
3289 {
3290 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3291 
3292 	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
3293 	return (0);
3294 }
3295 
3296 /*
3297  * Walk through the param array specified registering each element with the
3298  * named dispatch (ND) handler.
3299  */
3300 static boolean_t
3301 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
3302 {
3303 	for (; cnt-- > 0; icmppa++) {
3304 		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
3305 			if (!nd_load(ndp, icmppa->icmp_param_name,
3306 			    icmp_param_get, icmp_param_set,
3307 			    (caddr_t)icmppa)) {
3308 				nd_free(ndp);
3309 				return (B_FALSE);
3310 			}
3311 		}
3312 	}
3313 	if (!nd_load(ndp, "icmp_status", icmp_status_report, NULL,
3314 	    NULL)) {
3315 		nd_free(ndp);
3316 		return (B_FALSE);
3317 	}
3318 	return (B_TRUE);
3319 }
3320 
3321 /* This routine sets an ND variable in a icmpparam_t structure. */
3322 /* ARGSUSED */
3323 static int
3324 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
3325 {
3326 	long		new_value;
3327 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3328 
3329 	/*
3330 	 * Fail the request if the new value does not lie within the
3331 	 * required bounds.
3332 	 */
3333 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
3334 	    new_value < icmppa->icmp_param_min ||
3335 	    new_value > icmppa->icmp_param_max) {
3336 		return (EINVAL);
3337 	}
3338 	/* Set the new value */
3339 	icmppa->icmp_param_value = new_value;
3340 	return (0);
3341 }
3342 static void
3343 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
3344 {
3345 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
3346 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
3347 		/*
3348 		 * fallback has started but messages have not been moved yet
3349 		 */
3350 		if (icmp->icmp_fallback_queue_head == NULL) {
3351 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
3352 			icmp->icmp_fallback_queue_head = mp;
3353 			icmp->icmp_fallback_queue_tail = mp;
3354 		} else {
3355 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
3356 			icmp->icmp_fallback_queue_tail->b_next = mp;
3357 			icmp->icmp_fallback_queue_tail = mp;
3358 		}
3359 		mutex_exit(&icmp->icmp_recv_lock);
3360 	} else {
3361 		/*
3362 		 * no more fallbacks possible, ok to drop lock.
3363 		 */
3364 		mutex_exit(&icmp->icmp_recv_lock);
3365 		putnext(icmp->icmp_connp->conn_rq, mp);
3366 	}
3367 }
3368 
3369 /*ARGSUSED2*/
3370 static void
3371 icmp_input(void *arg1, mblk_t *mp, void *arg2)
3372 {
3373 	conn_t *connp = (conn_t *)arg1;
3374 	struct T_unitdata_ind	*tudi;
3375 	uchar_t			*rptr;
3376 	icmp_t			*icmp;
3377 	icmp_stack_t		*is;
3378 	sin_t			*sin;
3379 	sin6_t			*sin6;
3380 	ip6_t			*ip6h;
3381 	ip6i_t			*ip6i;
3382 	mblk_t			*mp1;
3383 	int			hdr_len;
3384 	ipha_t			*ipha;
3385 	int			udi_size;	/* Size of T_unitdata_ind */
3386 	uint_t			ipvers;
3387 	ip6_pkt_t		ipp;
3388 	uint8_t			nexthdr;
3389 	ip_pktinfo_t		*pinfo = NULL;
3390 	mblk_t			*options_mp = NULL;
3391 	uint_t			icmp_opt = 0;
3392 	boolean_t		icmp_ipv6_recvhoplimit = B_FALSE;
3393 	uint_t			hopstrip;
3394 	int			error;
3395 
3396 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
3397 
3398 	icmp = connp->conn_icmp;
3399 	is = icmp->icmp_is;
3400 	rptr = mp->b_rptr;
3401 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
3402 	ASSERT(OK_32PTR(rptr));
3403 
3404 	/*
3405 	 * IP should have prepended the options data in an M_CTL
3406 	 * Check M_CTL "type" to make sure are not here bcos of
3407 	 * a valid ICMP message
3408 	 */
3409 	if (DB_TYPE(mp) == M_CTL) {
3410 		/*
3411 		 * FIXME: does IP still do this?
3412 		 * IP sends up the IPSEC_IN message for handling IPSEC
3413 		 * policy at the TCP level. We don't need it here.
3414 		 */
3415 		if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) {
3416 			mp1 = mp->b_cont;
3417 			freeb(mp);
3418 			mp = mp1;
3419 			rptr = mp->b_rptr;
3420 		} else if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
3421 		    ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
3422 		    IN_PKTINFO) {
3423 			/*
3424 			 * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information
3425 			 * has been prepended to the packet by IP. We need to
3426 			 * extract the mblk and adjust the rptr
3427 			 */
3428 			pinfo = (ip_pktinfo_t *)mp->b_rptr;
3429 			options_mp = mp;
3430 			mp = mp->b_cont;
3431 			rptr = mp->b_rptr;
3432 		} else {
3433 			/*
3434 			 * ICMP messages.
3435 			 */
3436 			icmp_icmp_error(connp, mp);
3437 			return;
3438 		}
3439 	}
3440 
3441 	/*
3442 	 * Discard message if it is misaligned or smaller than the IP header.
3443 	 */
3444 	if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) {
3445 		freemsg(mp);
3446 		if (options_mp != NULL)
3447 			freeb(options_mp);
3448 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3449 		return;
3450 	}
3451 	ipvers = IPH_HDR_VERSION((ipha_t *)rptr);
3452 
3453 	/* Handle M_DATA messages containing IP packets messages */
3454 	if (ipvers == IPV4_VERSION) {
3455 		/*
3456 		 * Special case where IP attaches
3457 		 * the IRE needs to be handled so that we don't send up
3458 		 * IRE to the user land.
3459 		 */
3460 		ipha = (ipha_t *)rptr;
3461 		hdr_len = IPH_HDR_LENGTH(ipha);
3462 
3463 		if (ipha->ipha_protocol == IPPROTO_TCP) {
3464 			tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3465 
3466 			if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) ==
3467 			    TH_SYN) && mp->b_cont != NULL) {
3468 				mp1 = mp->b_cont;
3469 				if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3470 					freeb(mp1);
3471 					mp->b_cont = NULL;
3472 				}
3473 			}
3474 		}
3475 		if (is->is_bsd_compat) {
3476 			ushort_t len;
3477 			len = ntohs(ipha->ipha_length);
3478 
3479 			if (mp->b_datap->db_ref > 1) {
3480 				/*
3481 				 * Allocate a new IP header so that we can
3482 				 * modify ipha_length.
3483 				 */
3484 				mblk_t	*mp1;
3485 
3486 				mp1 = allocb(hdr_len, BPRI_MED);
3487 				if (!mp1) {
3488 					freemsg(mp);
3489 					if (options_mp != NULL)
3490 						freeb(options_mp);
3491 					BUMP_MIB(&is->is_rawip_mib,
3492 					    rawipInErrors);
3493 					return;
3494 				}
3495 				bcopy(rptr, mp1->b_rptr, hdr_len);
3496 				mp->b_rptr = rptr + hdr_len;
3497 				rptr = mp1->b_rptr;
3498 				ipha = (ipha_t *)rptr;
3499 				mp1->b_cont = mp;
3500 				mp1->b_wptr = rptr + hdr_len;
3501 				mp = mp1;
3502 			}
3503 			len -= hdr_len;
3504 			ipha->ipha_length = htons(len);
3505 		}
3506 	}
3507 
3508 	/*
3509 	 * This is the inbound data path.  Packets are passed upstream as
3510 	 * T_UNITDATA_IND messages with full IP headers still attached.
3511 	 */
3512 	if (icmp->icmp_family == AF_INET) {
3513 		ASSERT(ipvers == IPV4_VERSION);
3514 		udi_size =  sizeof (struct T_unitdata_ind) + sizeof (sin_t);
3515 		if (icmp->icmp_recvif && (pinfo != NULL) &&
3516 		    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3517 			udi_size += sizeof (struct T_opthdr) +
3518 			    sizeof (uint_t);
3519 		}
3520 
3521 		if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3522 		    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3523 			udi_size += sizeof (struct T_opthdr) +
3524 			    sizeof (struct in_pktinfo);
3525 		}
3526 
3527 		/*
3528 		 * If SO_TIMESTAMP is set allocate the appropriate sized
3529 		 * buffer. Since gethrestime() expects a pointer aligned
3530 		 * argument, we allocate space necessary for extra
3531 		 * alignment (even though it might not be used).
3532 		 */
3533 		if (icmp->icmp_timestamp) {
3534 			udi_size += sizeof (struct T_opthdr) +
3535 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3536 		}
3537 		mp1 = allocb(udi_size, BPRI_MED);
3538 		if (mp1 == NULL) {
3539 			freemsg(mp);
3540 			if (options_mp != NULL)
3541 				freeb(options_mp);
3542 			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3543 			return;
3544 		}
3545 		mp1->b_cont = mp;
3546 		mp = mp1;
3547 		tudi = (struct T_unitdata_ind *)mp->b_rptr;
3548 		mp->b_datap->db_type = M_PROTO;
3549 		mp->b_wptr = (uchar_t *)tudi + udi_size;
3550 		tudi->PRIM_type = T_UNITDATA_IND;
3551 		tudi->SRC_length = sizeof (sin_t);
3552 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3553 		sin = (sin_t *)&tudi[1];
3554 		*sin = sin_null;
3555 		sin->sin_family = AF_INET;
3556 		sin->sin_addr.s_addr = ipha->ipha_src;
3557 		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
3558 		    sizeof (sin_t);
3559 		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
3560 		tudi->OPT_length = udi_size;
3561 
3562 		/*
3563 		 * Add options if IP_RECVIF is set
3564 		 */
3565 		if (udi_size != 0) {
3566 			char *dstopt;
3567 
3568 			dstopt = (char *)&sin[1];
3569 			if (icmp->icmp_recvif && (pinfo != NULL) &&
3570 			    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3571 
3572 				struct T_opthdr *toh;
3573 				uint_t		*dstptr;
3574 
3575 				toh = (struct T_opthdr *)dstopt;
3576 				toh->level = IPPROTO_IP;
3577 				toh->name = IP_RECVIF;
3578 				toh->len = sizeof (struct T_opthdr) +
3579 				    sizeof (uint_t);
3580 				toh->status = 0;
3581 				dstopt += sizeof (struct T_opthdr);
3582 				dstptr = (uint_t *)dstopt;
3583 				*dstptr = pinfo->ip_pkt_ifindex;
3584 				dstopt += sizeof (uint_t);
3585 				udi_size -= toh->len;
3586 			}
3587 			if (icmp->icmp_timestamp) {
3588 				struct	T_opthdr *toh;
3589 
3590 				toh = (struct T_opthdr *)dstopt;
3591 				toh->level = SOL_SOCKET;
3592 				toh->name = SCM_TIMESTAMP;
3593 				toh->len = sizeof (struct T_opthdr) +
3594 				    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3595 				toh->status = 0;
3596 				dstopt += sizeof (struct T_opthdr);
3597 				/* Align for gethrestime() */
3598 				dstopt = (char *)P2ROUNDUP((intptr_t)dstopt,
3599 				    sizeof (intptr_t));
3600 				gethrestime((timestruc_t *)dstopt);
3601 				dstopt = (char *)toh + toh->len;
3602 				udi_size -= toh->len;
3603 			}
3604 			if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3605 			    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3606 				struct	T_opthdr *toh;
3607 				struct	in_pktinfo *pktinfop;
3608 
3609 				toh = (struct T_opthdr *)dstopt;
3610 				toh->level = IPPROTO_IP;
3611 				toh->name = IP_PKTINFO;
3612 				toh->len = sizeof (struct T_opthdr) +
3613 				    sizeof (in_pktinfo_t);
3614 				toh->status = 0;
3615 				dstopt += sizeof (struct T_opthdr);
3616 				pktinfop = (struct in_pktinfo *)dstopt;
3617 				pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex;
3618 				pktinfop->ipi_spec_dst =
3619 				    pinfo->ip_pkt_match_addr;
3620 
3621 				pktinfop->ipi_addr.s_addr = ipha->ipha_dst;
3622 
3623 				dstopt += sizeof (struct in_pktinfo);
3624 				udi_size -= toh->len;
3625 			}
3626 
3627 			/* Consumed all of allocated space */
3628 			ASSERT(udi_size == 0);
3629 		}
3630 
3631 		if (options_mp != NULL)
3632 			freeb(options_mp);
3633 
3634 		BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
3635 		goto deliver;
3636 	}
3637 
3638 	/*
3639 	 * We don't need options_mp in the IPv6 path.
3640 	 */
3641 	if (options_mp != NULL) {
3642 		freeb(options_mp);
3643 		options_mp = NULL;
3644 	}
3645 
3646 	/*
3647 	 * Discard message if it is smaller than the IPv6 header
3648 	 * or if the header is malformed.
3649 	 */
3650 	if ((mp->b_wptr - rptr) < sizeof (ip6_t) ||
3651 	    IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION ||
3652 	    icmp->icmp_family != AF_INET6) {
3653 		freemsg(mp);
3654 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3655 		return;
3656 	}
3657 
3658 	/* Initialize */
3659 	ipp.ipp_fields = 0;
3660 	hopstrip = 0;
3661 
3662 	ip6h = (ip6_t *)rptr;
3663 	/*
3664 	 * Call on ip_find_hdr_v6 which gets the total hdr len
3665 	 * as well as individual lenghts of ext hdrs (and ptrs to
3666 	 * them).
3667 	 */
3668 	if (ip6h->ip6_nxt != icmp->icmp_proto) {
3669 		/* Look for ifindex information */
3670 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3671 			ip6i = (ip6i_t *)ip6h;
3672 			if (ip6i->ip6i_flags & IP6I_IFINDEX) {
3673 				ASSERT(ip6i->ip6i_ifindex != 0);
3674 				ipp.ipp_fields |= IPPF_IFINDEX;
3675 				ipp.ipp_ifindex = ip6i->ip6i_ifindex;
3676 			}
3677 			rptr = (uchar_t *)&ip6i[1];
3678 			mp->b_rptr = rptr;
3679 			if (rptr == mp->b_wptr) {
3680 				mp1 = mp->b_cont;
3681 				freeb(mp);
3682 				mp = mp1;
3683 				rptr = mp->b_rptr;
3684 			}
3685 			ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN);
3686 			ip6h = (ip6_t *)rptr;
3687 		}
3688 		hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr);
3689 
3690 		/*
3691 		 * We need to lie a bit to the user because users inside
3692 		 * labeled compartments should not see their own labels.  We
3693 		 * assume that in all other respects IP has checked the label,
3694 		 * and that the label is always first among the options.  (If
3695 		 * it's not first, then this code won't see it, and the option
3696 		 * will be passed along to the user.)
3697 		 *
3698 		 * If we had multilevel ICMP sockets, then the following code
3699 		 * should be skipped for them to allow the user to see the
3700 		 * label.
3701 		 *
3702 		 * Alignment restrictions in the definition of IP options
3703 		 * (namely, the requirement that the 4-octet DOI goes on a
3704 		 * 4-octet boundary) mean that we know exactly where the option
3705 		 * should start, but we're lenient for other hosts.
3706 		 *
3707 		 * Note that there are no multilevel ICMP or raw IP sockets
3708 		 * yet, thus nobody ever sees the IP6OPT_LS option.
3709 		 */
3710 		if ((ipp.ipp_fields & IPPF_HOPOPTS) &&
3711 		    ipp.ipp_hopoptslen > 5 && is_system_labeled()) {
3712 			const uchar_t *ucp =
3713 			    (const uchar_t *)ipp.ipp_hopopts + 2;
3714 			int remlen = ipp.ipp_hopoptslen - 2;
3715 
3716 			while (remlen > 0) {
3717 				if (*ucp == IP6OPT_PAD1) {
3718 					remlen--;
3719 					ucp++;
3720 				} else if (*ucp == IP6OPT_PADN) {
3721 					remlen -= ucp[1] + 2;
3722 					ucp += ucp[1] + 2;
3723 				} else if (*ucp == ip6opt_ls) {
3724 					hopstrip = (ucp -
3725 					    (const uchar_t *)ipp.ipp_hopopts) +
3726 					    ucp[1] + 2;
3727 					hopstrip = (hopstrip + 7) & ~7;
3728 					break;
3729 				} else {
3730 					/* label option must be first */
3731 					break;
3732 				}
3733 			}
3734 		}
3735 	} else {
3736 		hdr_len = IPV6_HDR_LEN;
3737 		ip6i = NULL;
3738 		nexthdr = ip6h->ip6_nxt;
3739 	}
3740 	/*
3741 	 * One special case where IP attaches the IRE needs to
3742 	 * be handled so that we don't send up IRE to the user land.
3743 	 */
3744 	if (nexthdr == IPPROTO_TCP) {
3745 		tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3746 
3747 		if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) &&
3748 		    mp->b_cont != NULL) {
3749 			mp1 = mp->b_cont;
3750 			if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3751 				freeb(mp1);
3752 				mp->b_cont = NULL;
3753 			}
3754 		}
3755 	}
3756 	/*
3757 	 * Check a filter for ICMPv6 types if needed.
3758 	 * Verify raw checksums if needed.
3759 	 */
3760 	if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) {
3761 		if (icmp->icmp_filter != NULL) {
3762 			int type;
3763 
3764 			/* Assumes that IP has done the pullupmsg */
3765 			type = mp->b_rptr[hdr_len];
3766 
3767 			ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr);
3768 			if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
3769 				freemsg(mp);
3770 				return;
3771 			}
3772 		} else {
3773 			/* Checksum */
3774 			uint16_t	*up;
3775 			uint32_t	sum;
3776 			int		remlen;
3777 
3778 			up = (uint16_t *)&ip6h->ip6_src;
3779 
3780 			remlen = msgdsize(mp) - hdr_len;
3781 			sum = htons(icmp->icmp_proto + remlen)
3782 			    + up[0] + up[1] + up[2] + up[3]
3783 			    + up[4] + up[5] + up[6] + up[7]
3784 			    + up[8] + up[9] + up[10] + up[11]
3785 			    + up[12] + up[13] + up[14] + up[15];
3786 			sum = (sum & 0xffff) + (sum >> 16);
3787 			sum = IP_CSUM(mp, hdr_len, sum);
3788 			if (sum != 0) {
3789 				/* IPv6 RAW checksum failed */
3790 				ip0dbg(("icmp_rput: RAW checksum "
3791 				    "failed %x\n", sum));
3792 				freemsg(mp);
3793 				BUMP_MIB(&is->is_rawip_mib,
3794 				    rawipInCksumErrs);
3795 				return;
3796 			}
3797 		}
3798 	}
3799 	/* Skip all the IPv6 headers per API */
3800 	mp->b_rptr += hdr_len;
3801 
3802 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3803 
3804 	/*
3805 	 * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to
3806 	 * maintain state information, instead of relying on icmp_t
3807 	 * structure, since there arent any locks protecting these members
3808 	 * and there is a window where there might be a race between a
3809 	 * thread setting options on the write side and a thread reading
3810 	 * these options on the read size.
3811 	 */
3812 	if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS|
3813 	    IPPF_RTHDR|IPPF_IFINDEX)) {
3814 		if (icmp->icmp_ipv6_recvhopopts &&
3815 		    (ipp.ipp_fields & IPPF_HOPOPTS) &&
3816 		    ipp.ipp_hopoptslen > hopstrip) {
3817 			udi_size += sizeof (struct T_opthdr) +
3818 			    ipp.ipp_hopoptslen - hopstrip;
3819 			icmp_opt |= IPPF_HOPOPTS;
3820 		}
3821 		if ((icmp->icmp_ipv6_recvdstopts ||
3822 		    icmp->icmp_old_ipv6_recvdstopts) &&
3823 		    (ipp.ipp_fields & IPPF_DSTOPTS)) {
3824 			udi_size += sizeof (struct T_opthdr) +
3825 			    ipp.ipp_dstoptslen;
3826 			icmp_opt |= IPPF_DSTOPTS;
3827 		}
3828 		if (((icmp->icmp_ipv6_recvdstopts &&
3829 		    icmp->icmp_ipv6_recvrthdr &&
3830 		    (ipp.ipp_fields & IPPF_RTHDR)) ||
3831 		    icmp->icmp_ipv6_recvrtdstopts) &&
3832 		    (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
3833 			udi_size += sizeof (struct T_opthdr) +
3834 			    ipp.ipp_rtdstoptslen;
3835 			icmp_opt |= IPPF_RTDSTOPTS;
3836 		}
3837 		if (icmp->icmp_ipv6_recvrthdr &&
3838 		    (ipp.ipp_fields & IPPF_RTHDR)) {
3839 			udi_size += sizeof (struct T_opthdr) +
3840 			    ipp.ipp_rthdrlen;
3841 			icmp_opt |= IPPF_RTHDR;
3842 		}
3843 		if (icmp->icmp_ip_recvpktinfo &&
3844 		    (ipp.ipp_fields & IPPF_IFINDEX)) {
3845 			udi_size += sizeof (struct T_opthdr) +
3846 			    sizeof (struct in6_pktinfo);
3847 			icmp_opt |= IPPF_IFINDEX;
3848 		}
3849 	}
3850 	if (icmp->icmp_ipv6_recvhoplimit) {
3851 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3852 		icmp_ipv6_recvhoplimit = B_TRUE;
3853 	}
3854 
3855 	if (icmp->icmp_ipv6_recvtclass)
3856 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3857 
3858 	/*
3859 	 * If SO_TIMESTAMP is set allocate the appropriate sized
3860 	 * buffer. Since gethrestime() expects a pointer aligned
3861 	 * argument, we allocate space necessary for extra
3862 	 * alignment (even though it might not be used).
3863 	 */
3864 	if (icmp->icmp_timestamp) {
3865 		udi_size += sizeof (struct T_opthdr) +
3866 		    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3867 	}
3868 
3869 	mp1 = allocb(udi_size, BPRI_MED);
3870 	if (mp1 == NULL) {
3871 		freemsg(mp);
3872 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3873 		return;
3874 	}
3875 	mp1->b_cont = mp;
3876 	mp = mp1;
3877 	mp->b_datap->db_type = M_PROTO;
3878 	tudi = (struct T_unitdata_ind *)mp->b_rptr;
3879 	mp->b_wptr = (uchar_t *)tudi + udi_size;
3880 	tudi->PRIM_type = T_UNITDATA_IND;
3881 	tudi->SRC_length = sizeof (sin6_t);
3882 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3883 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3884 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
3885 	tudi->OPT_length = udi_size;
3886 	sin6 = (sin6_t *)&tudi[1];
3887 	sin6->sin6_port = 0;
3888 	sin6->sin6_family = AF_INET6;
3889 
3890 	sin6->sin6_addr = ip6h->ip6_src;
3891 	/* No sin6_flowinfo per API */
3892 	sin6->sin6_flowinfo = 0;
3893 	/* For link-scope source pass up scope id */
3894 	if ((ipp.ipp_fields & IPPF_IFINDEX) &&
3895 	    IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
3896 		sin6->sin6_scope_id = ipp.ipp_ifindex;
3897 	else
3898 		sin6->sin6_scope_id = 0;
3899 
3900 	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
3901 	    icmp->icmp_zoneid, is->is_netstack);
3902 
3903 	if (udi_size != 0) {
3904 		uchar_t *dstopt;
3905 
3906 		dstopt = (uchar_t *)&sin6[1];
3907 		if (icmp_opt & IPPF_IFINDEX) {
3908 			struct T_opthdr *toh;
3909 			struct in6_pktinfo *pkti;
3910 
3911 			toh = (struct T_opthdr *)dstopt;
3912 			toh->level = IPPROTO_IPV6;
3913 			toh->name = IPV6_PKTINFO;
3914 			toh->len = sizeof (struct T_opthdr) +
3915 			    sizeof (*pkti);
3916 			toh->status = 0;
3917 			dstopt += sizeof (struct T_opthdr);
3918 			pkti = (struct in6_pktinfo *)dstopt;
3919 			pkti->ipi6_addr = ip6h->ip6_dst;
3920 			pkti->ipi6_ifindex = ipp.ipp_ifindex;
3921 			dstopt += sizeof (*pkti);
3922 			udi_size -= toh->len;
3923 		}
3924 		if (icmp_ipv6_recvhoplimit) {
3925 			struct T_opthdr *toh;
3926 
3927 			toh = (struct T_opthdr *)dstopt;
3928 			toh->level = IPPROTO_IPV6;
3929 			toh->name = IPV6_HOPLIMIT;
3930 			toh->len = sizeof (struct T_opthdr) +
3931 			    sizeof (uint_t);
3932 			toh->status = 0;
3933 			dstopt += sizeof (struct T_opthdr);
3934 			*(uint_t *)dstopt = ip6h->ip6_hops;
3935 			dstopt += sizeof (uint_t);
3936 			udi_size -= toh->len;
3937 		}
3938 		if (icmp->icmp_ipv6_recvtclass) {
3939 			struct T_opthdr *toh;
3940 
3941 			toh = (struct T_opthdr *)dstopt;
3942 			toh->level = IPPROTO_IPV6;
3943 			toh->name = IPV6_TCLASS;
3944 			toh->len = sizeof (struct T_opthdr) +
3945 			    sizeof (uint_t);
3946 			toh->status = 0;
3947 			dstopt += sizeof (struct T_opthdr);
3948 			*(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
3949 			dstopt += sizeof (uint_t);
3950 			udi_size -= toh->len;
3951 		}
3952 		if (icmp->icmp_timestamp) {
3953 			struct  T_opthdr *toh;
3954 
3955 			toh = (struct T_opthdr *)dstopt;
3956 			toh->level = SOL_SOCKET;
3957 			toh->name = SCM_TIMESTAMP;
3958 			toh->len = sizeof (struct T_opthdr) +
3959 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3960 			toh->status = 0;
3961 			dstopt += sizeof (struct T_opthdr);
3962 			/* Align for gethrestime() */
3963 			dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt,
3964 			    sizeof (intptr_t));
3965 			gethrestime((timestruc_t *)dstopt);
3966 			dstopt = (uchar_t *)toh + toh->len;
3967 			udi_size -= toh->len;
3968 		}
3969 
3970 		if (icmp_opt & IPPF_HOPOPTS) {
3971 			struct T_opthdr *toh;
3972 
3973 			toh = (struct T_opthdr *)dstopt;
3974 			toh->level = IPPROTO_IPV6;
3975 			toh->name = IPV6_HOPOPTS;
3976 			toh->len = sizeof (struct T_opthdr) +
3977 			    ipp.ipp_hopoptslen - hopstrip;
3978 			toh->status = 0;
3979 			dstopt += sizeof (struct T_opthdr);
3980 			bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt,
3981 			    ipp.ipp_hopoptslen - hopstrip);
3982 			if (hopstrip > 0) {
3983 				/* copy next header value and fake length */
3984 				dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0];
3985 				dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] -
3986 				    hopstrip / 8;
3987 			}
3988 			dstopt += ipp.ipp_hopoptslen - hopstrip;
3989 			udi_size -= toh->len;
3990 		}
3991 		if (icmp_opt & IPPF_RTDSTOPTS) {
3992 			struct T_opthdr *toh;
3993 
3994 			toh = (struct T_opthdr *)dstopt;
3995 			toh->level = IPPROTO_IPV6;
3996 			toh->name = IPV6_DSTOPTS;
3997 			toh->len = sizeof (struct T_opthdr) +
3998 			    ipp.ipp_rtdstoptslen;
3999 			toh->status = 0;
4000 			dstopt += sizeof (struct T_opthdr);
4001 			bcopy(ipp.ipp_rtdstopts, dstopt,
4002 			    ipp.ipp_rtdstoptslen);
4003 			dstopt += ipp.ipp_rtdstoptslen;
4004 			udi_size -= toh->len;
4005 		}
4006 		if (icmp_opt & IPPF_RTHDR) {
4007 			struct T_opthdr *toh;
4008 
4009 			toh = (struct T_opthdr *)dstopt;
4010 			toh->level = IPPROTO_IPV6;
4011 			toh->name = IPV6_RTHDR;
4012 			toh->len = sizeof (struct T_opthdr) +
4013 			    ipp.ipp_rthdrlen;
4014 			toh->status = 0;
4015 			dstopt += sizeof (struct T_opthdr);
4016 			bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen);
4017 			dstopt += ipp.ipp_rthdrlen;
4018 			udi_size -= toh->len;
4019 		}
4020 		if (icmp_opt & IPPF_DSTOPTS) {
4021 			struct T_opthdr *toh;
4022 
4023 			toh = (struct T_opthdr *)dstopt;
4024 			toh->level = IPPROTO_IPV6;
4025 			toh->name = IPV6_DSTOPTS;
4026 			toh->len = sizeof (struct T_opthdr) +
4027 			    ipp.ipp_dstoptslen;
4028 			toh->status = 0;
4029 			dstopt += sizeof (struct T_opthdr);
4030 			bcopy(ipp.ipp_dstopts, dstopt,
4031 			    ipp.ipp_dstoptslen);
4032 			dstopt += ipp.ipp_dstoptslen;
4033 			udi_size -= toh->len;
4034 		}
4035 		/* Consumed all of allocated space */
4036 		ASSERT(udi_size == 0);
4037 	}
4038 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
4039 
4040 deliver:
4041 	if (IPCL_IS_NONSTR(connp)) {
4042 		if ((*connp->conn_upcalls->su_recv)
4043 		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
4044 		    NULL) < 0) {
4045 			mutex_enter(&icmp->icmp_recv_lock);
4046 			if (error == ENOSPC) {
4047 				/*
4048 				 * let's confirm while holding the lock
4049 				 */
4050 				if ((*connp->conn_upcalls->su_recv)
4051 				    (connp->conn_upper_handle, NULL, 0, 0,
4052 				    &error, NULL) < 0) {
4053 					if (error == ENOSPC) {
4054 						connp->conn_flow_cntrld =
4055 						    B_TRUE;
4056 					} else {
4057 						ASSERT(error == EOPNOTSUPP);
4058 					}
4059 				}
4060 				mutex_exit(&icmp->icmp_recv_lock);
4061 			} else {
4062 				ASSERT(error == EOPNOTSUPP);
4063 				icmp_queue_fallback(icmp, mp);
4064 			}
4065 		}
4066 	} else {
4067 		putnext(connp->conn_rq, mp);
4068 	}
4069 	ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
4070 }
4071 
4072 /*
4073  * return SNMP stuff in buffer in mpdata
4074  */
4075 mblk_t *
4076 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
4077 {
4078 	mblk_t			*mpdata;
4079 	struct opthdr		*optp;
4080 	conn_t			*connp = Q_TO_CONN(q);
4081 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
4082 	mblk_t			*mp2ctl;
4083 
4084 	/*
4085 	 * make a copy of the original message
4086 	 */
4087 	mp2ctl = copymsg(mpctl);
4088 
4089 	if (mpctl == NULL ||
4090 	    (mpdata = mpctl->b_cont) == NULL) {
4091 		freemsg(mpctl);
4092 		freemsg(mp2ctl);
4093 		return (0);
4094 	}
4095 
4096 	/* fixed length structure for IPv4 and IPv6 counters */
4097 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
4098 	optp->level = EXPER_RAWIP;
4099 	optp->name = 0;
4100 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
4101 	    sizeof (is->is_rawip_mib));
4102 	optp->len = msgdsize(mpdata);
4103 	qreply(q, mpctl);
4104 
4105 	return (mp2ctl);
4106 }
4107 
4108 /*
4109  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
4110  * TODO:  If this ever actually tries to set anything, it needs to be
4111  * to do the appropriate locking.
4112  */
4113 /* ARGSUSED */
4114 int
4115 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
4116     uchar_t *ptr, int len)
4117 {
4118 	switch (level) {
4119 	case EXPER_RAWIP:
4120 		return (0);
4121 	default:
4122 		return (1);
4123 	}
4124 }
4125 
4126 /* Report for ndd "icmp_status" */
4127 /* ARGSUSED */
4128 static int
4129 icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
4130 {
4131 	conn_t  *connp;
4132 	ip_stack_t *ipst;
4133 	char	laddrbuf[INET6_ADDRSTRLEN];
4134 	char	faddrbuf[INET6_ADDRSTRLEN];
4135 	int	i;
4136 
4137 	(void) mi_mpprintf(mp,
4138 	    "RAWIP    " MI_COL_HDRPAD_STR
4139 	/*   01234567[89ABCDEF] */
4140 	    "  src addr        dest addr       state");
4141 	/*   xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx UNBOUND */
4142 
4143 	connp = Q_TO_CONN(q);
4144 	ipst = connp->conn_netstack->netstack_ip;
4145 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
4146 		connf_t *connfp;
4147 		char	*state;
4148 
4149 		connfp = &ipst->ips_ipcl_globalhash_fanout[i];
4150 		connp = NULL;
4151 
4152 		while ((connp = ipcl_get_next_conn(connfp, connp,
4153 		    IPCL_RAWIPCONN)) != NULL) {
4154 			icmp_t  *icmp;
4155 
4156 			mutex_enter(&(connp)->conn_lock);
4157 			icmp = connp->conn_icmp;
4158 
4159 			if (icmp->icmp_state == TS_UNBND)
4160 				state = "UNBOUND";
4161 			else if (icmp->icmp_state == TS_IDLE)
4162 				state = "IDLE";
4163 			else if (icmp->icmp_state == TS_DATA_XFER)
4164 				state = "CONNECTED";
4165 			else
4166 				state = "UnkState";
4167 
4168 			(void) mi_mpprintf(mp, MI_COL_PTRFMT_STR "%s %s %s",
4169 			    (void *)icmp,
4170 			    inet_ntop(AF_INET6, &icmp->icmp_v6dst.sin6_addr,
4171 			    faddrbuf,
4172 			    sizeof (faddrbuf)),
4173 			    inet_ntop(AF_INET6, &icmp->icmp_v6src, laddrbuf,
4174 			    sizeof (laddrbuf)),
4175 			    state);
4176 			mutex_exit(&(connp)->conn_lock);
4177 		}
4178 	}
4179 	return (0);
4180 }
4181 
4182 /*
4183  * This routine creates a T_UDERROR_IND message and passes it upstream.
4184  * The address and options are copied from the T_UNITDATA_REQ message
4185  * passed in mp.  This message is freed.
4186  */
4187 static void
4188 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
4189 {
4190 	mblk_t	*mp1;
4191 	uchar_t	*rptr = mp->b_rptr;
4192 	struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr;
4193 
4194 	mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset],
4195 	    tudr->DEST_length, (char *)&rptr[tudr->OPT_offset],
4196 	    tudr->OPT_length, err);
4197 	if (mp1)
4198 		qreply(q, mp1);
4199 	freemsg(mp);
4200 }
4201 
4202 
4203 static int
4204 rawip_do_unbind(conn_t *connp)
4205 {
4206 	icmp_t *icmp = connp->conn_icmp;
4207 
4208 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4209 	/* If a bind has not been done, we can't unbind. */
4210 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
4211 		rw_exit(&icmp->icmp_rwlock);
4212 		return (-TOUTSTATE);
4213 	}
4214 	icmp->icmp_pending_op = T_UNBIND_REQ;
4215 	rw_exit(&icmp->icmp_rwlock);
4216 
4217 	/*
4218 	 * Call ip to unbind
4219 	 */
4220 
4221 	ip_unbind(connp);
4222 
4223 	/*
4224 	 * Once we're unbound from IP, the pending operation may be cleared
4225 	 * here.
4226 	 */
4227 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4228 	V6_SET_ZERO(icmp->icmp_v6src);
4229 	V6_SET_ZERO(icmp->icmp_bound_v6src);
4230 	icmp->icmp_pending_op = -1;
4231 	icmp->icmp_state = TS_UNBND;
4232 	if (icmp->icmp_family == AF_INET6)
4233 		(void) icmp_build_hdrs(icmp);
4234 	rw_exit(&icmp->icmp_rwlock);
4235 	return (0);
4236 }
4237 
4238 /*
4239  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
4240  * After some error checking, the message is passed downstream to ip.
4241  */
4242 static void
4243 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
4244 {
4245 	conn_t	*connp = Q_TO_CONN(q);
4246 	int	error;
4247 
4248 	ASSERT(mp->b_cont == NULL);
4249 	error = rawip_do_unbind(connp);
4250 	if (error) {
4251 		if (error < 0) {
4252 			icmp_err_ack(q, mp, -error, 0);
4253 		} else {
4254 			icmp_err_ack(q, mp, 0, error);
4255 		}
4256 		return;
4257 	}
4258 
4259 	/*
4260 	 * Convert mp into a T_OK_ACK
4261 	 */
4262 
4263 	mp = mi_tpi_ok_ack_alloc(mp);
4264 
4265 	/*
4266 	 * should not happen in practice... T_OK_ACK is smaller than the
4267 	 * original message.
4268 	 */
4269 	ASSERT(mp != NULL);
4270 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
4271 	qreply(q, mp);
4272 }
4273 
4274 
4275 /*
4276  * Process IPv4 packets that already include an IP header.
4277  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
4278  * IPPROTO_IGMP).
4279  */
4280 static int
4281 icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp,
4282     ip4_pkt_t *pktinfop)
4283 {
4284 	icmp_stack_t *is = icmp->icmp_is;
4285 	ipha_t	*ipha;
4286 	int	ip_hdr_length;
4287 	int	tp_hdr_len;
4288 	mblk_t	*mp1;
4289 	uint_t	pkt_len;
4290 	ip_opt_info_t optinfo;
4291 
4292 	optinfo.ip_opt_flags = 0;
4293 	optinfo.ip_opt_ill_index = 0;
4294 	ipha = (ipha_t *)mp->b_rptr;
4295 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4296 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
4297 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4298 			ASSERT(icmp != NULL);
4299 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4300 			freemsg(mp);
4301 			return (0);
4302 		}
4303 		ipha = (ipha_t *)mp->b_rptr;
4304 	}
4305 	ipha->ipha_version_and_hdr_length =
4306 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
4307 
4308 	/*
4309 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4310 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4311 	 * tell IP that the application has sent a complete IP header and not
4312 	 * to compute the transport checksum nor change the DF flag.
4313 	 */
4314 	ipha->ipha_ident = IP_HDR_INCLUDED;
4315 	ipha->ipha_hdr_checksum = 0;
4316 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
4317 	/* Insert options if any */
4318 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4319 		/*
4320 		 * Put the IP header plus any transport header that is
4321 		 * checksumed by ip_wput into the first mblk. (ip_wput assumes
4322 		 * that at least the checksum field is in the first mblk.)
4323 		 */
4324 		switch (ipha->ipha_protocol) {
4325 		case IPPROTO_UDP:
4326 			tp_hdr_len = 8;
4327 			break;
4328 		case IPPROTO_TCP:
4329 			tp_hdr_len = 20;
4330 			break;
4331 		default:
4332 			tp_hdr_len = 0;
4333 			break;
4334 		}
4335 		/*
4336 		 * The code below assumes that IP_SIMPLE_HDR_LENGTH plus
4337 		 * tp_hdr_len bytes will be in a single mblk.
4338 		 */
4339 		if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH +
4340 		    tp_hdr_len)) {
4341 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH +
4342 			    tp_hdr_len)) {
4343 				BUMP_MIB(&is->is_rawip_mib,
4344 				    rawipOutErrors);
4345 				freemsg(mp);
4346 				return (0);
4347 			}
4348 			ipha = (ipha_t *)mp->b_rptr;
4349 		}
4350 
4351 		/*
4352 		 * if the length is larger then the max allowed IP packet,
4353 		 * then send an error and abort the processing.
4354 		 */
4355 		pkt_len = ntohs(ipha->ipha_length)
4356 		    + icmp->icmp_ip_snd_options_len;
4357 		if (pkt_len > IP_MAXPACKET) {
4358 			return (EMSGSIZE);
4359 		}
4360 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra +
4361 		    tp_hdr_len, BPRI_LO))) {
4362 			return (ENOMEM);
4363 		}
4364 		mp1->b_rptr += is->is_wroff_extra;
4365 		mp1->b_wptr = mp1->b_rptr + ip_hdr_length;
4366 
4367 		ipha->ipha_length = htons((uint16_t)pkt_len);
4368 		bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH);
4369 
4370 		/* Copy transport header if any */
4371 		bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len);
4372 		mp1->b_wptr += tp_hdr_len;
4373 
4374 		/* Add options */
4375 		ipha = (ipha_t *)mp1->b_rptr;
4376 		bcopy(icmp->icmp_ip_snd_options, &ipha[1],
4377 		    icmp->icmp_ip_snd_options_len);
4378 
4379 		/* Drop IP header and transport header from original */
4380 		(void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len);
4381 
4382 		mp1->b_cont = mp;
4383 		mp = mp1;
4384 		/*
4385 		 * Massage source route putting first source
4386 		 * route in ipha_dst.
4387 		 */
4388 		(void) ip_massage_options(ipha, is->is_netstack);
4389 	}
4390 
4391 	if (pktinfop != NULL) {
4392 		/*
4393 		 * Over write the source address provided in the header
4394 		 */
4395 		if (pktinfop->ip4_addr != INADDR_ANY) {
4396 			ipha->ipha_src = pktinfop->ip4_addr;
4397 			optinfo.ip_opt_flags = IP_VERIFY_SRC;
4398 		}
4399 
4400 		if (pktinfop->ip4_ill_index != 0) {
4401 			optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4402 		}
4403 	}
4404 
4405 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4406 	return (0);
4407 }
4408 
4409 static int
4410 icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
4411 {
4412 	int err;
4413 	uchar_t opt_storage[IP_MAX_OPT_LENGTH];
4414 	icmp_stack_t		*is = icmp->icmp_is;
4415 	conn_t			*connp = icmp->icmp_connp;
4416 	cred_t			*cr;
4417 
4418 	/*
4419 	 * All Solaris components should pass a db_credp
4420 	 * for this message, hence we ASSERT.
4421 	 * On production kernels we return an error to be robust against
4422 	 * random streams modules sitting on top of us.
4423 	 */
4424 	cr = msg_getcred(mp, NULL);
4425 	ASSERT(cr != NULL);
4426 	if (cr == NULL)
4427 		return (EINVAL);
4428 
4429 	err = tsol_compute_label(cr, dst,
4430 	    opt_storage, connp->conn_mac_exempt,
4431 	    is->is_netstack->netstack_ip);
4432 	if (err == 0) {
4433 		err = tsol_update_options(&icmp->icmp_ip_snd_options,
4434 		    &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len,
4435 		    opt_storage);
4436 	}
4437 	if (err != 0) {
4438 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4439 		DTRACE_PROBE4(
4440 		    tx__ip__log__drop__updatelabel__icmp,
4441 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4442 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4443 		return (err);
4444 	}
4445 	IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst);
4446 	return (0);
4447 }
4448 
4449 /*
4450  * This routine handles all messages passed downstream.  It either
4451  * consumes the message or passes it downstream; it never queues a
4452  * a message.
4453  */
4454 static void
4455 icmp_wput(queue_t *q, mblk_t *mp)
4456 {
4457 	uchar_t	*rptr = mp->b_rptr;
4458 	ipha_t	*ipha;
4459 	mblk_t	*mp1;
4460 #define	tudr ((struct T_unitdata_req *)rptr)
4461 	size_t	ip_len;
4462 	conn_t	*connp = Q_TO_CONN(q);
4463 	icmp_t	*icmp = connp->conn_icmp;
4464 	icmp_stack_t *is = icmp->icmp_is;
4465 	sin6_t	*sin6;
4466 	sin_t	*sin;
4467 	ipaddr_t	v4dst;
4468 	ip4_pkt_t	pktinfo;
4469 	ip4_pkt_t	*pktinfop = &pktinfo;
4470 	ip6_pkt_t	ipp_s;  /* For ancillary data options */
4471 	ip6_pkt_t	*ipp = &ipp_s;
4472 	int error;
4473 
4474 	ipp->ipp_fields = 0;
4475 	ipp->ipp_sticky_ignored = 0;
4476 
4477 	switch (mp->b_datap->db_type) {
4478 	case M_DATA:
4479 		if (icmp->icmp_hdrincl) {
4480 			ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
4481 			ipha = (ipha_t *)mp->b_rptr;
4482 			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
4483 				if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4484 					BUMP_MIB(&is->is_rawip_mib,
4485 					    rawipOutErrors);
4486 					freemsg(mp);
4487 					return;
4488 				}
4489 				ipha = (ipha_t *)mp->b_rptr;
4490 			}
4491 			/*
4492 			 * If this connection was used for v6 (inconceivable!)
4493 			 * or if we have a new destination, then it's time to
4494 			 * figure a new label.
4495 			 */
4496 			if (is_system_labeled() &&
4497 			    (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4498 			    V4_PART_OF_V6(icmp->icmp_v6lastdst) !=
4499 			    ipha->ipha_dst)) {
4500 				error = icmp_update_label(icmp, mp,
4501 				    ipha->ipha_dst);
4502 				if (error != 0) {
4503 					icmp_ud_err(q, mp, error);
4504 					return;
4505 				}
4506 			}
4507 			error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL);
4508 			if (error != 0)
4509 				icmp_ud_err(q, mp, error);
4510 			return;
4511 		}
4512 		freemsg(mp);
4513 		return;
4514 	case M_PROTO:
4515 	case M_PCPROTO:
4516 		ip_len = mp->b_wptr - rptr;
4517 		if (ip_len >= sizeof (struct T_unitdata_req)) {
4518 			/* Expedite valid T_UNITDATA_REQ to below the switch */
4519 			if (((union T_primitives *)rptr)->type
4520 			    == T_UNITDATA_REQ)
4521 				break;
4522 		}
4523 		/* FALLTHRU */
4524 	default:
4525 		icmp_wput_other(q, mp);
4526 		return;
4527 	}
4528 
4529 	/* Handle T_UNITDATA_REQ messages here. */
4530 
4531 	mp1 = mp->b_cont;
4532 	if (mp1 == NULL) {
4533 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4534 		icmp_ud_err(q, mp, EPROTO);
4535 		return;
4536 	}
4537 
4538 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
4539 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4540 		icmp_ud_err(q, mp, EADDRNOTAVAIL);
4541 		return;
4542 	}
4543 
4544 	switch (icmp->icmp_family) {
4545 	case AF_INET6:
4546 		sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
4547 		if (!OK_32PTR((char *)sin6) ||
4548 		    tudr->DEST_length != sizeof (sin6_t) ||
4549 		    sin6->sin6_family != AF_INET6) {
4550 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4551 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4552 			return;
4553 		}
4554 
4555 		/* No support for mapped addresses on raw sockets */
4556 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4557 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4558 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4559 			return;
4560 		}
4561 
4562 		/*
4563 		 * Destination is a native IPv6 address.
4564 		 * Send out an IPv6 format packet.
4565 		 */
4566 		if (tudr->OPT_length != 0) {
4567 			int error;
4568 
4569 			error = 0;
4570 			if (icmp_unitdata_opt_process(q, mp, &error,
4571 			    (void *)ipp) < 0) {
4572 				/* failure */
4573 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4574 				icmp_ud_err(q, mp, error);
4575 				return;
4576 			}
4577 			ASSERT(error == 0);
4578 		}
4579 
4580 		error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp);
4581 		goto done;
4582 
4583 	case AF_INET:
4584 		sin = (sin_t *)&rptr[tudr->DEST_offset];
4585 		if (!OK_32PTR((char *)sin) ||
4586 		    tudr->DEST_length != sizeof (sin_t) ||
4587 		    sin->sin_family != AF_INET) {
4588 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4589 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4590 			return;
4591 		}
4592 		/* Extract and ipaddr */
4593 		v4dst = sin->sin_addr.s_addr;
4594 		break;
4595 
4596 	default:
4597 		ASSERT(0);
4598 	}
4599 
4600 	pktinfop->ip4_ill_index = 0;
4601 	pktinfop->ip4_addr = INADDR_ANY;
4602 
4603 	/*
4604 	 * If options passed in, feed it for verification and handling
4605 	 */
4606 	if (tudr->OPT_length != 0) {
4607 		int error;
4608 
4609 		error = 0;
4610 		if (icmp_unitdata_opt_process(q, mp, &error,
4611 		    (void *)pktinfop) < 0) {
4612 			/* failure */
4613 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4614 			icmp_ud_err(q, mp, error);
4615 			return;
4616 		}
4617 		ASSERT(error == 0);
4618 		/*
4619 		 * Note: Success in processing options.
4620 		 * mp option buffer represented by
4621 		 * OPT_length/offset now potentially modified
4622 		 * and contain option setting results
4623 		 */
4624 	}
4625 
4626 	error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop);
4627 done:
4628 	if (error != 0) {
4629 		icmp_ud_err(q, mp, error);
4630 		return;
4631 	} else {
4632 		mp->b_cont = NULL;
4633 		freeb(mp);
4634 	}
4635 }
4636 
4637 
4638 /* ARGSUSED */
4639 static void
4640 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4641 {
4642 #ifdef DEBUG
4643 	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4644 #endif
4645 	freemsg(mp);
4646 }
4647 
4648 static int
4649 raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst,
4650     ip4_pkt_t *pktinfop)
4651 {
4652 	ipha_t	*ipha;
4653 	size_t	ip_len;
4654 	icmp_t	*icmp = connp->conn_icmp;
4655 	icmp_stack_t *is = icmp->icmp_is;
4656 	int	ip_hdr_length;
4657 	ip_opt_info_t	optinfo;
4658 
4659 	optinfo.ip_opt_flags = 0;
4660 	optinfo.ip_opt_ill_index = 0;
4661 
4662 	if (icmp->icmp_state == TS_UNBND) {
4663 		/* If a port has not been bound to the stream, fail. */
4664 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4665 		return (EPROTO);
4666 	}
4667 
4668 	if (v4dst == INADDR_ANY)
4669 		v4dst = htonl(INADDR_LOOPBACK);
4670 
4671 	/* Check if our saved options are valid; update if not */
4672 	if (is_system_labeled() &&
4673 	    (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4674 	    V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst)) {
4675 		int error = icmp_update_label(icmp, mp, v4dst);
4676 
4677 		if (error != 0)
4678 			return (error);
4679 	}
4680 
4681 	/* Protocol 255 contains full IP headers */
4682 	if (icmp->icmp_hdrincl)
4683 		return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop));
4684 
4685 	/* Add an IP header */
4686 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4687 	ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length];
4688 	if ((uchar_t *)ipha < mp->b_datap->db_base ||
4689 	    mp->b_datap->db_ref != 1 ||
4690 	    !OK_32PTR(ipha)) {
4691 		mblk_t	*mp1;
4692 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra,
4693 		    BPRI_LO))) {
4694 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4695 			return (ENOMEM);
4696 		}
4697 		mp1->b_cont = mp;
4698 		ipha = (ipha_t *)mp1->b_datap->db_lim;
4699 		mp1->b_wptr = (uchar_t *)ipha;
4700 		ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length);
4701 		mp = mp1;
4702 	}
4703 #ifdef	_BIG_ENDIAN
4704 	/* Set version, header length, and tos */
4705 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4706 	    ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) |
4707 	    icmp->icmp_type_of_service);
4708 	/* Set ttl and protocol */
4709 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto;
4710 #else
4711 	/* Set version, header length, and tos */
4712 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4713 	    ((icmp->icmp_type_of_service << 8) |
4714 	    ((IP_VERSION << 4) | (ip_hdr_length>>2)));
4715 	/* Set ttl and protocol */
4716 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl;
4717 #endif
4718 	if (pktinfop->ip4_addr != INADDR_ANY) {
4719 		ipha->ipha_src = pktinfop->ip4_addr;
4720 		optinfo.ip_opt_flags = IP_VERIFY_SRC;
4721 	} else {
4722 
4723 		/*
4724 		 * Copy our address into the packet.  If this is zero,
4725 		 * ip will fill in the real source address.
4726 		 */
4727 		IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src);
4728 	}
4729 
4730 	ipha->ipha_fragment_offset_and_flags = 0;
4731 
4732 	if (pktinfop->ip4_ill_index != 0) {
4733 		optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4734 	}
4735 
4736 
4737 	/*
4738 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4739 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4740 	 * tell IP that the application has sent a complete IP header and not
4741 	 * to compute the transport checksum nor change the DF flag.
4742 	 */
4743 	ipha->ipha_ident = IP_HDR_INCLUDED;
4744 
4745 	/* Finish common formatting of the packet. */
4746 	mp->b_rptr = (uchar_t *)ipha;
4747 
4748 	ip_len = mp->b_wptr - (uchar_t *)ipha;
4749 	if (mp->b_cont != NULL)
4750 		ip_len += msgdsize(mp->b_cont);
4751 
4752 	/*
4753 	 * Set the length into the IP header.
4754 	 * If the length is greater than the maximum allowed by IP,
4755 	 * then free the message and return. Do not try and send it
4756 	 * as this can cause problems in layers below.
4757 	 */
4758 	if (ip_len > IP_MAXPACKET) {
4759 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4760 		return (EMSGSIZE);
4761 	}
4762 	ipha->ipha_length = htons((uint16_t)ip_len);
4763 	/*
4764 	 * Copy in the destination address request
4765 	 */
4766 	ipha->ipha_dst = v4dst;
4767 
4768 	/*
4769 	 * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic.
4770 	 */
4771 	if (CLASSD(v4dst))
4772 		ipha->ipha_ttl = icmp->icmp_multicast_ttl;
4773 
4774 	/* Copy in options if any */
4775 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4776 		bcopy(icmp->icmp_ip_snd_options,
4777 		    &ipha[1], icmp->icmp_ip_snd_options_len);
4778 		/*
4779 		 * Massage source route putting first source route in ipha_dst.
4780 		 * Ignore the destination in the T_unitdata_req.
4781 		 */
4782 		(void) ip_massage_options(ipha, is->is_netstack);
4783 	}
4784 
4785 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4786 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4787 	return (0);
4788 }
4789 
4790 static int
4791 icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
4792 {
4793 	int err;
4794 	uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
4795 	icmp_stack_t		*is = icmp->icmp_is;
4796 	conn_t			*connp = icmp->icmp_connp;
4797 	cred_t			*cr;
4798 
4799 	/*
4800 	 * All Solaris components should pass a db_credp
4801 	 * for this message, hence we ASSERT.
4802 	 * On production kernels we return an error to be robust against
4803 	 * random streams modules sitting on top of us.
4804 	 */
4805 	cr = msg_getcred(mp, NULL);
4806 	ASSERT(cr != NULL);
4807 	if (cr == NULL)
4808 		return (EINVAL);
4809 
4810 	err = tsol_compute_label_v6(cr, dst,
4811 	    opt_storage, connp->conn_mac_exempt,
4812 	    is->is_netstack->netstack_ip);
4813 	if (err == 0) {
4814 		err = tsol_update_sticky(&icmp->icmp_sticky_ipp,
4815 		    &icmp->icmp_label_len_v6, opt_storage);
4816 	}
4817 	if (err != 0) {
4818 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4819 		DTRACE_PROBE4(
4820 		    tx__ip__log__drop__updatelabel__icmp6,
4821 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4822 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4823 		return (err);
4824 	}
4825 
4826 	icmp->icmp_v6lastdst = *dst;
4827 	return (0);
4828 }
4829 
4830 /*
4831  * raw_ip_send_data_v6():
4832  * Assumes that icmp_wput did some sanity checking on the destination
4833  * address, but that the label may not yet be correct.
4834  */
4835 static int
4836 raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6,
4837     ip6_pkt_t *ipp)
4838 {
4839 	ip6_t			*ip6h;
4840 	ip6i_t			*ip6i;	/* mp->b_rptr even if no ip6i_t */
4841 	int			ip_hdr_len = IPV6_HDR_LEN;
4842 	size_t			ip_len;
4843 	icmp_t			*icmp = connp->conn_icmp;
4844 	icmp_stack_t		*is = icmp->icmp_is;
4845 	ip6_pkt_t		*tipp;
4846 	uint32_t		csum = 0;
4847 	uint_t			ignore = 0;
4848 	uint_t			option_exists = 0, is_sticky = 0;
4849 	uint8_t			*cp;
4850 	uint8_t			*nxthdr_ptr;
4851 	in6_addr_t		ip6_dst;
4852 
4853 	/*
4854 	 * If the local address is a mapped address return
4855 	 * an error.
4856 	 * It would be possible to send an IPv6 packet but the
4857 	 * response would never make it back to the application
4858 	 * since it is bound to a mapped address.
4859 	 */
4860 	if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) {
4861 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4862 		return (EADDRNOTAVAIL);
4863 	}
4864 
4865 	ignore = ipp->ipp_sticky_ignored;
4866 	if (sin6->sin6_scope_id != 0 &&
4867 	    IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
4868 		/*
4869 		 * IPPF_SCOPE_ID is special.  It's neither a sticky
4870 		 * option nor ancillary data.  It needs to be
4871 		 * explicitly set in options_exists.
4872 		 */
4873 		option_exists |= IPPF_SCOPE_ID;
4874 	}
4875 
4876 	/*
4877 	 * Compute the destination address
4878 	 */
4879 	ip6_dst = sin6->sin6_addr;
4880 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4881 		ip6_dst = ipv6_loopback;
4882 
4883 	/*
4884 	 * If we're not going to the same destination as last time, then
4885 	 * recompute the label required.  This is done in a separate routine to
4886 	 * avoid blowing up our stack here.
4887 	 */
4888 	if (is_system_labeled() &&
4889 	    !IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst)) {
4890 		int error = 0;
4891 
4892 		error = icmp_update_label_v6(icmp, mp, &ip6_dst);
4893 		if (error != 0)
4894 			return (error);
4895 	}
4896 
4897 	/*
4898 	 * If there's a security label here, then we ignore any options the
4899 	 * user may try to set.  We keep the peer's label as a hidden sticky
4900 	 * option.
4901 	 */
4902 	if (icmp->icmp_label_len_v6 > 0) {
4903 		ignore &= ~IPPF_HOPOPTS;
4904 		ipp->ipp_fields &= ~IPPF_HOPOPTS;
4905 	}
4906 
4907 	if ((icmp->icmp_sticky_ipp.ipp_fields == 0) &&
4908 	    (ipp->ipp_fields == 0)) {
4909 		/* No sticky options nor ancillary data. */
4910 		goto no_options;
4911 	}
4912 
4913 	/*
4914 	 * Go through the options figuring out where each is going to
4915 	 * come from and build two masks.  The first mask indicates if
4916 	 * the option exists at all.  The second mask indicates if the
4917 	 * option is sticky or ancillary.
4918 	 */
4919 	if (!(ignore & IPPF_HOPOPTS)) {
4920 		if (ipp->ipp_fields & IPPF_HOPOPTS) {
4921 			option_exists |= IPPF_HOPOPTS;
4922 			ip_hdr_len += ipp->ipp_hopoptslen;
4923 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
4924 			option_exists |= IPPF_HOPOPTS;
4925 			is_sticky |= IPPF_HOPOPTS;
4926 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_hopoptslen;
4927 		}
4928 	}
4929 
4930 	if (!(ignore & IPPF_RTHDR)) {
4931 		if (ipp->ipp_fields & IPPF_RTHDR) {
4932 			option_exists |= IPPF_RTHDR;
4933 			ip_hdr_len += ipp->ipp_rthdrlen;
4934 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
4935 			option_exists |= IPPF_RTHDR;
4936 			is_sticky |= IPPF_RTHDR;
4937 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen;
4938 		}
4939 	}
4940 
4941 	if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) {
4942 		/*
4943 		 * Need to have a router header to use these.
4944 		 */
4945 		if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
4946 			option_exists |= IPPF_RTDSTOPTS;
4947 			ip_hdr_len += ipp->ipp_rtdstoptslen;
4948 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
4949 			option_exists |= IPPF_RTDSTOPTS;
4950 			is_sticky |= IPPF_RTDSTOPTS;
4951 			ip_hdr_len +=
4952 			    icmp->icmp_sticky_ipp.ipp_rtdstoptslen;
4953 		}
4954 	}
4955 
4956 	if (!(ignore & IPPF_DSTOPTS)) {
4957 		if (ipp->ipp_fields & IPPF_DSTOPTS) {
4958 			option_exists |= IPPF_DSTOPTS;
4959 			ip_hdr_len += ipp->ipp_dstoptslen;
4960 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
4961 			option_exists |= IPPF_DSTOPTS;
4962 			is_sticky |= IPPF_DSTOPTS;
4963 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen;
4964 		}
4965 	}
4966 
4967 	if (!(ignore & IPPF_IFINDEX)) {
4968 		if (ipp->ipp_fields & IPPF_IFINDEX) {
4969 			option_exists |= IPPF_IFINDEX;
4970 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) {
4971 			option_exists |= IPPF_IFINDEX;
4972 			is_sticky |= IPPF_IFINDEX;
4973 		}
4974 	}
4975 
4976 	if (!(ignore & IPPF_ADDR)) {
4977 		if (ipp->ipp_fields & IPPF_ADDR) {
4978 			option_exists |= IPPF_ADDR;
4979 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) {
4980 			option_exists |= IPPF_ADDR;
4981 			is_sticky |= IPPF_ADDR;
4982 		}
4983 	}
4984 
4985 	if (!(ignore & IPPF_DONTFRAG)) {
4986 		if (ipp->ipp_fields & IPPF_DONTFRAG) {
4987 			option_exists |= IPPF_DONTFRAG;
4988 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) {
4989 			option_exists |= IPPF_DONTFRAG;
4990 			is_sticky |= IPPF_DONTFRAG;
4991 		}
4992 	}
4993 
4994 	if (!(ignore & IPPF_USE_MIN_MTU)) {
4995 		if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
4996 			option_exists |= IPPF_USE_MIN_MTU;
4997 		} else if (icmp->icmp_sticky_ipp.ipp_fields &
4998 		    IPPF_USE_MIN_MTU) {
4999 			option_exists |= IPPF_USE_MIN_MTU;
5000 			is_sticky |= IPPF_USE_MIN_MTU;
5001 		}
5002 	}
5003 
5004 	if (!(ignore & IPPF_NEXTHOP)) {
5005 		if (ipp->ipp_fields & IPPF_NEXTHOP) {
5006 			option_exists |= IPPF_NEXTHOP;
5007 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) {
5008 			option_exists |= IPPF_NEXTHOP;
5009 			is_sticky |= IPPF_NEXTHOP;
5010 		}
5011 	}
5012 
5013 	if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT))
5014 		option_exists |= IPPF_HOPLIMIT;
5015 	/* IPV6_HOPLIMIT can never be sticky */
5016 	ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT));
5017 
5018 	if (!(ignore & IPPF_UNICAST_HOPS) &&
5019 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) {
5020 		option_exists |= IPPF_UNICAST_HOPS;
5021 		is_sticky |= IPPF_UNICAST_HOPS;
5022 	}
5023 
5024 	if (!(ignore & IPPF_MULTICAST_HOPS) &&
5025 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) {
5026 		option_exists |= IPPF_MULTICAST_HOPS;
5027 		is_sticky |= IPPF_MULTICAST_HOPS;
5028 	}
5029 
5030 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) {
5031 		/* This is a sticky socket option only */
5032 		option_exists |= IPPF_NO_CKSUM;
5033 		is_sticky |= IPPF_NO_CKSUM;
5034 	}
5035 
5036 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) {
5037 		/* This is a sticky socket option only */
5038 		option_exists |= IPPF_RAW_CKSUM;
5039 		is_sticky |= IPPF_RAW_CKSUM;
5040 	}
5041 
5042 	if (!(ignore & IPPF_TCLASS)) {
5043 		if (ipp->ipp_fields & IPPF_TCLASS) {
5044 			option_exists |= IPPF_TCLASS;
5045 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) {
5046 			option_exists |= IPPF_TCLASS;
5047 			is_sticky |= IPPF_TCLASS;
5048 		}
5049 	}
5050 
5051 no_options:
5052 
5053 	/*
5054 	 * If any options carried in the ip6i_t were specified, we
5055 	 * need to account for the ip6i_t in the data we'll be sending
5056 	 * down.
5057 	 */
5058 	if (option_exists & IPPF_HAS_IP6I)
5059 		ip_hdr_len += sizeof (ip6i_t);
5060 
5061 	/* check/fix buffer config, setup pointers into it */
5062 	ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len];
5063 	if ((mp->b_datap->db_ref != 1) ||
5064 	    ((unsigned char *)ip6h < mp->b_datap->db_base) ||
5065 	    !OK_32PTR(ip6h)) {
5066 		mblk_t	*mp1;
5067 
5068 		/* Try to get everything in a single mblk next time */
5069 		if (ip_hdr_len > icmp->icmp_max_hdr_len) {
5070 			icmp->icmp_max_hdr_len = ip_hdr_len;
5071 
5072 			(void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp,
5073 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
5074 		}
5075 		mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO);
5076 		if (!mp1) {
5077 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5078 			return (ENOMEM);
5079 		}
5080 		mp1->b_cont = mp;
5081 		mp1->b_wptr = mp1->b_datap->db_lim;
5082 		ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len);
5083 		mp = mp1;
5084 	}
5085 	mp->b_rptr = (unsigned char *)ip6h;
5086 	ip6i = (ip6i_t *)ip6h;
5087 
5088 #define	ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp)
5089 	if (option_exists & IPPF_HAS_IP6I) {
5090 		ip6h = (ip6_t *)&ip6i[1];
5091 		ip6i->ip6i_flags = 0;
5092 		ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5093 
		/* sin6_scope_id takes precedence over IPPF_IFINDEX */
5095 		if (option_exists & IPPF_SCOPE_ID) {
5096 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5097 			ip6i->ip6i_ifindex = sin6->sin6_scope_id;
5098 		} else if (option_exists & IPPF_IFINDEX) {
5099 			tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX);
5100 			ASSERT(tipp->ipp_ifindex != 0);
5101 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5102 			ip6i->ip6i_ifindex = tipp->ipp_ifindex;
5103 		}
5104 
5105 		if (option_exists & IPPF_RAW_CKSUM) {
5106 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
5107 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
5108 		}
5109 
5110 		if (option_exists & IPPF_NO_CKSUM) {
5111 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
5112 		}
5113 
5114 		if (option_exists & IPPF_ADDR) {
5115 			/*
5116 			 * Enable per-packet source address verification if
5117 			 * IPV6_PKTINFO specified the source address.
5118 			 * ip6_src is set in the transport's _wput function.
5119 			 */
5120 			ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
5121 		}
5122 
5123 		if (option_exists & IPPF_DONTFRAG) {
5124 			ip6i->ip6i_flags |= IP6I_DONTFRAG;
5125 		}
5126 
5127 		if (option_exists & IPPF_USE_MIN_MTU) {
5128 			ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU(
5129 			    ip6i->ip6i_flags, ipp->ipp_use_min_mtu);
5130 		}
5131 
5132 		if (option_exists & IPPF_NEXTHOP) {
5133 			tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP);
5134 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop));
5135 			ip6i->ip6i_flags |= IP6I_NEXTHOP;
5136 			ip6i->ip6i_nexthop = tipp->ipp_nexthop;
5137 		}
5138 
5139 		/*
5140 		 * tell IP this is an ip6i_t private header
5141 		 */
5142 		ip6i->ip6i_nxt = IPPROTO_RAW;
5143 	}
5144 
5145 	/* Initialize IPv6 header */
5146 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5147 	bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src));
5148 
5149 	/* Set the hoplimit of the outgoing packet. */
5150 	if (option_exists & IPPF_HOPLIMIT) {
5151 		/* IPV6_HOPLIMIT ancillary data overrides all other settings. */
5152 		ip6h->ip6_hops = ipp->ipp_hoplimit;
5153 		ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5154 	} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
5155 		ip6h->ip6_hops = icmp->icmp_multicast_ttl;
5156 		if (option_exists & IPPF_MULTICAST_HOPS)
5157 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5158 	} else {
5159 		ip6h->ip6_hops = icmp->icmp_ttl;
5160 		if (option_exists & IPPF_UNICAST_HOPS)
5161 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5162 	}
5163 
5164 	if (option_exists & IPPF_ADDR) {
5165 		tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR);
5166 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr));
5167 		ip6h->ip6_src = tipp->ipp_addr;
5168 	} else {
5169 		/*
5170 		 * The source address was not set using IPV6_PKTINFO.
5171 		 * First look at the bound source.
5172 		 * If unspecified fallback to __sin6_src_id.
5173 		 */
5174 		ip6h->ip6_src = icmp->icmp_v6src;
5175 		if (sin6->__sin6_src_id != 0 &&
5176 		    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
5177 			ip_srcid_find_id(sin6->__sin6_src_id,
5178 			    &ip6h->ip6_src, icmp->icmp_zoneid,
5179 			    is->is_netstack);
5180 		}
5181 	}
5182 
5183 	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
5184 	cp = (uint8_t *)&ip6h[1];
5185 
5186 	/*
5187 	 * Here's where we have to start stringing together
5188 	 * any extension headers in the right order:
5189 	 * Hop-by-hop, destination, routing, and final destination opts.
5190 	 */
5191 	if (option_exists & IPPF_HOPOPTS) {
5192 		/* Hop-by-hop options */
5193 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
5194 		tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS);
5195 
5196 		*nxthdr_ptr = IPPROTO_HOPOPTS;
5197 		nxthdr_ptr = &hbh->ip6h_nxt;
5198 
5199 		bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen);
5200 		cp += tipp->ipp_hopoptslen;
5201 	}
5202 	/*
5203 	 * En-route destination options
5204 	 * Only do them if there's a routing header as well
5205 	 */
5206 	if (option_exists & IPPF_RTDSTOPTS) {
5207 		ip6_dest_t *dst = (ip6_dest_t *)cp;
5208 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS);
5209 
5210 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5211 		nxthdr_ptr = &dst->ip6d_nxt;
5212 
5213 		bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen);
5214 		cp += tipp->ipp_rtdstoptslen;
5215 	}
5216 	/*
5217 	 * Routing header next
5218 	 */
5219 	if (option_exists & IPPF_RTHDR) {
5220 		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
5221 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR);
5222 
5223 		*nxthdr_ptr = IPPROTO_ROUTING;
5224 		nxthdr_ptr = &rt->ip6r_nxt;
5225 
5226 		bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen);
5227 		cp += tipp->ipp_rthdrlen;
5228 	}
5229 	/*
5230 	 * Do ultimate destination options
5231 	 */
5232 	if (option_exists & IPPF_DSTOPTS) {
5233 		ip6_dest_t *dest = (ip6_dest_t *)cp;
5234 		tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS);
5235 
5236 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5237 		nxthdr_ptr = &dest->ip6d_nxt;
5238 
5239 		bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen);
5240 		cp += tipp->ipp_dstoptslen;
5241 	}
5242 
5243 	/*
5244 	 * Now set the last header pointer to the proto passed in
5245 	 */
5246 	ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len);
5247 	*nxthdr_ptr = icmp->icmp_proto;
5248 
5249 	/*
5250 	 * Copy in the destination address
5251 	 */
5252 	ip6h->ip6_dst = ip6_dst;
5253 
5254 	ip6h->ip6_vcf =
5255 	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
5256 	    (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
5257 
5258 	if (option_exists & IPPF_TCLASS) {
5259 		tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS);
5260 		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
5261 		    tipp->ipp_tclass);
5262 	}
5263 	if (option_exists & IPPF_RTHDR) {
5264 		ip6_rthdr_t	*rth;
5265 
5266 		/*
5267 		 * Perform any processing needed for source routing.
5268 		 * We know that all extension headers will be in the same mblk
5269 		 * as the IPv6 header.
5270 		 */
5271 		rth = ip_find_rthdr_v6(ip6h, mp->b_wptr);
5272 		if (rth != NULL && rth->ip6r_segleft != 0) {
5273 			if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
5274 				/*
5275 				 * Drop packet - only support Type 0 routing.
5276 				 * Notify the application as well.
5277 				 */
5278 				BUMP_MIB(&is->is_rawip_mib,
5279 				    rawipOutErrors);
5280 				return (EPROTO);
5281 			}
5282 			/*
5283 			 * rth->ip6r_len is twice the number of
5284 			 * addresses in the header
5285 			 */
5286 			if (rth->ip6r_len & 0x1) {
5287 				BUMP_MIB(&is->is_rawip_mib,
5288 				    rawipOutErrors);
5289 				return (EPROTO);
5290 			}
5291 			/*
5292 			 * Shuffle the routing header and ip6_dst
5293 			 * addresses, and get the checksum difference
5294 			 * between the first hop (in ip6_dst) and
5295 			 * the destination (in the last routing hdr entry).
5296 			 */
5297 			csum = ip_massage_options_v6(ip6h, rth,
5298 			    is->is_netstack);
5299 			/*
5300 			 * Verify that the first hop isn't a mapped address.
5301 			 * Routers along the path need to do this verification
5302 			 * for subsequent hops.
5303 			 */
5304 			if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
5305 				BUMP_MIB(&is->is_rawip_mib,
5306 				    rawipOutErrors);
5307 				return (EADDRNOTAVAIL);
5308 			}
5309 		}
5310 	}
5311 
5312 	ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
5313 	if (mp->b_cont != NULL)
5314 		ip_len += msgdsize(mp->b_cont);
5315 
5316 	/*
5317 	 * Set the length into the IP header.
5318 	 * If the length is greater than the maximum allowed by IP,
5319 	 * then free the message and return. Do not try and send it
5320 	 * as this can cause problems in layers below.
5321 	 */
5322 	if (ip_len > IP_MAXPACKET) {
5323 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5324 		return (EMSGSIZE);
5325 	}
5326 	if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) {
5327 		uint_t	cksum_off;	/* From ip6i == mp->b_rptr */
5328 		uint16_t *cksum_ptr;
5329 		uint_t	ext_hdrs_len;
5330 
5331 		/* ICMPv6 must have an offset matching icmp6_cksum offset */
5332 		ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 ||
5333 		    icmp->icmp_checksum_off == 2);
5334 
5335 		/*
5336 		 * We make it easy for IP to include our pseudo header
5337 		 * by putting our length in uh_checksum, modified (if
5338 		 * we have a routing header) by the checksum difference
5339 		 * between the ultimate destination and first hop addresses.
5340 		 * Note: ICMPv6 must always checksum the packet.
5341 		 */
5342 		cksum_off = ip_hdr_len + icmp->icmp_checksum_off;
5343 		if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) {
5344 			if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) {
5345 				BUMP_MIB(&is->is_rawip_mib,
5346 				    rawipOutErrors);
5347 				freemsg(mp);
5348 				return (0);
5349 			}
5350 			ip6i = (ip6i_t *)mp->b_rptr;
5351 			if (ip6i->ip6i_nxt == IPPROTO_RAW)
5352 				ip6h = (ip6_t *)&ip6i[1];
5353 			else
5354 				ip6h = (ip6_t *)ip6i;
5355 		}
5356 		/* Add payload length to checksum */
5357 		ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN -
5358 		    (int)((uchar_t *)ip6h - (uchar_t *)ip6i);
5359 		csum += htons(ip_len - ext_hdrs_len);
5360 
5361 		cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off);
5362 		csum = (csum & 0xFFFF) + (csum >> 16);
5363 		*cksum_ptr = (uint16_t)csum;
5364 	}
5365 
5366 #ifdef _LITTLE_ENDIAN
5367 	ip_len = htons(ip_len);
5368 #endif
5369 	ip6h->ip6_plen = (uint16_t)ip_len;
5370 
5371 	/* We're done. Pass the packet to IP */
5372 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
5373 	ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT);
5374 	return (0);
5375 }
5376 
5377 static void
5378 icmp_wput_other(queue_t *q, mblk_t *mp)
5379 {
5380 	uchar_t	*rptr = mp->b_rptr;
5381 	struct iocblk *iocp;
5382 #define	tudr ((struct T_unitdata_req *)rptr)
5383 	conn_t	*connp = Q_TO_CONN(q);
5384 	icmp_t	*icmp = connp->conn_icmp;
5385 	icmp_stack_t *is = icmp->icmp_is;
5386 	cred_t *cr;
5387 
5388 	switch (mp->b_datap->db_type) {
5389 	case M_PROTO:
5390 	case M_PCPROTO:
5391 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
5392 			/*
5393 			 * If the message does not contain a PRIM_type,
5394 			 * throw it away.
5395 			 */
5396 			freemsg(mp);
5397 			return;
5398 		}
5399 		switch (((union T_primitives *)rptr)->type) {
5400 		case T_ADDR_REQ:
5401 			icmp_addr_req(q, mp);
5402 			return;
5403 		case O_T_BIND_REQ:
5404 		case T_BIND_REQ:
5405 			icmp_tpi_bind(q, mp);
5406 			return;
5407 		case T_CONN_REQ:
5408 			icmp_tpi_connect(q, mp);
5409 			return;
5410 		case T_CAPABILITY_REQ:
5411 			icmp_capability_req(q, mp);
5412 			return;
5413 		case T_INFO_REQ:
5414 			icmp_info_req(q, mp);
5415 			return;
5416 		case T_UNITDATA_REQ:
5417 			/*
5418 			 * If a T_UNITDATA_REQ gets here, the address must
5419 			 * be bad.  Valid T_UNITDATA_REQs are found above
5420 			 * and break to below this switch.
5421 			 */
5422 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
5423 			return;
5424 		case T_UNBIND_REQ:
5425 			icmp_tpi_unbind(q, mp);
5426 			return;
5427 
5428 		case T_SVR4_OPTMGMT_REQ:
5429 			/*
5430 			 * All Solaris components should pass a db_credp
5431 			 * for this TPI message, hence we ASSERT.
5432 			 * But in case there is some other M_PROTO that looks
5433 			 * like a TPI message sent by some other kernel
5434 			 * component, we check and return an error.
5435 			 */
5436 			cr = msg_getcred(mp, NULL);
5437 			ASSERT(cr != NULL);
5438 			if (cr == NULL) {
5439 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
5440 				return;
5441 			}
5442 
5443 			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
5444 			    cr)) {
5445 				/* Only IP can return anything meaningful */
5446 				(void) svr4_optcom_req(q, mp, cr,
5447 				    &icmp_opt_obj, B_TRUE);
5448 			}
5449 			return;
5450 
5451 		case T_OPTMGMT_REQ:
5452 			/*
5453 			 * All Solaris components should pass a db_credp
5454 			 * for this TPI message, hence we ASSERT.
5455 			 * But in case there is some other M_PROTO that looks
5456 			 * like a TPI message sent by some other kernel
5457 			 * component, we check and return an error.
5458 			 */
5459 			cr = msg_getcred(mp, NULL);
5460 			ASSERT(cr != NULL);
5461 			if (cr == NULL) {
5462 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
5463 				return;
5464 			}
5465 			/* Only IP can return anything meaningful */
5466 			(void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE);
5467 			return;
5468 
5469 		case T_DISCON_REQ:
5470 			icmp_tpi_disconnect(q, mp);
5471 			return;
5472 
5473 		/* The following TPI message is not supported by icmp. */
5474 		case O_T_CONN_RES:
5475 		case T_CONN_RES:
5476 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
5477 			return;
5478 
5479 		/* The following 3 TPI requests are illegal for icmp. */
5480 		case T_DATA_REQ:
5481 		case T_EXDATA_REQ:
5482 		case T_ORDREL_REQ:
5483 			freemsg(mp);
5484 			(void) putctl1(RD(q), M_ERROR, EPROTO);
5485 			return;
5486 		default:
5487 			break;
5488 		}
5489 		break;
5490 	case M_IOCTL:
5491 		iocp = (struct iocblk *)mp->b_rptr;
5492 		switch (iocp->ioc_cmd) {
5493 		case TI_GETPEERNAME:
5494 			if (icmp->icmp_state != TS_DATA_XFER) {
5495 				/*
5496 				 * If a default destination address has not
5497 				 * been associated with the stream, then we
5498 				 * don't know the peer's name.
5499 				 */
5500 				iocp->ioc_error = ENOTCONN;
5501 		err_ret:;
5502 				iocp->ioc_count = 0;
5503 				mp->b_datap->db_type = M_IOCACK;
5504 				qreply(q, mp);
5505 				return;
5506 			}
5507 			/* FALLTHRU */
5508 		case TI_GETMYNAME:
5509 			/*
5510 			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
5511 			 * need to copyin the user's strbuf structure.
5512 			 * Processing will continue in the M_IOCDATA case
5513 			 * below.
5514 			 */
5515 			mi_copyin(q, mp, NULL,
5516 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
5517 			return;
5518 		case ND_SET:
5519 			/* nd_getset performs the necessary error checking */
5520 		case ND_GET:
5521 			if (nd_getset(q, is->is_nd, mp)) {
5522 				qreply(q, mp);
5523 				return;
5524 			}
5525 			break;
5526 		case _SIOCSOCKFALLBACK:
5527 			/*
5528 			 * socket is falling back to be a
5529 			 * streams socket. Nothing  to do
5530 			 */
5531 			iocp->ioc_count = 0;
5532 			iocp->ioc_rval = 0;
5533 			qreply(q, mp);
5534 			return;
5535 		default:
5536 			break;
5537 		}
5538 		break;
5539 	case M_IOCDATA:
5540 		icmp_wput_iocdata(q, mp);
5541 		return;
5542 	default:
5543 		break;
5544 	}
5545 	ip_wput(q, mp);
5546 }
5547 
5548 /*
5549  * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA
5550  * messages.
5551  */
5552 static void
5553 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
5554 {
5555 	mblk_t	*mp1;
5556 	STRUCT_HANDLE(strbuf, sb);
5557 	icmp_t	*icmp;
5558 	uint_t	addrlen;
5559 	uint_t	error;
5560 
5561 	/* Make sure it is one of ours. */
5562 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5563 	case TI_GETMYNAME:
5564 	case TI_GETPEERNAME:
5565 		break;
5566 	default:
5567 		icmp = Q_TO_ICMP(q);
5568 		ip_output(icmp->icmp_connp, mp, q, IP_WPUT);
5569 		return;
5570 	}
5571 	switch (mi_copy_state(q, mp, &mp1)) {
5572 	case -1:
5573 		return;
5574 	case MI_COPY_CASE(MI_COPY_IN, 1):
5575 		break;
5576 	case MI_COPY_CASE(MI_COPY_OUT, 1):
5577 		/*
5578 		 * The address has been copied out, so now
5579 		 * copyout the strbuf.
5580 		 */
5581 		mi_copyout(q, mp);
5582 		return;
5583 	case MI_COPY_CASE(MI_COPY_OUT, 2):
5584 		/*
5585 		 * The address and strbuf have been copied out.
5586 		 * We're done, so just acknowledge the original
5587 		 * M_IOCTL.
5588 		 */
5589 		mi_copy_done(q, mp, 0);
5590 		return;
5591 	default:
5592 		/*
5593 		 * Something strange has happened, so acknowledge
5594 		 * the original M_IOCTL with an EPROTO error.
5595 		 */
5596 		mi_copy_done(q, mp, EPROTO);
5597 		return;
5598 	}
5599 	/*
5600 	 * Now we have the strbuf structure for TI_GETMYNAME
5601 	 * and TI_GETPEERNAME.  Next we copyout the requested
5602 	 * address and then we'll copyout the strbuf.
5603 	 */
5604 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
5605 	    (void *)mp1->b_rptr);
5606 	icmp = Q_TO_ICMP(q);
5607 	if (icmp->icmp_family == AF_INET)
5608 		addrlen = sizeof (sin_t);
5609 	else
5610 		addrlen = sizeof (sin6_t);
5611 
5612 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
5613 		mi_copy_done(q, mp, EINVAL);
5614 		return;
5615 	}
5616 
5617 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
5618 
5619 	if (mp1 == NULL)
5620 		return;
5621 
5622 	rw_enter(&icmp->icmp_rwlock, RW_READER);
5623 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5624 	case TI_GETMYNAME:
5625 		error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr,
5626 		    &addrlen);
5627 		break;
5628 	case TI_GETPEERNAME:
5629 		error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr,
5630 		    &addrlen);
5631 		break;
5632 	}
5633 	rw_exit(&icmp->icmp_rwlock);
5634 
5635 	if (error != 0) {
5636 		mi_copy_done(q, mp, error);
5637 	} else {
5638 		mp1->b_wptr += addrlen;
5639 		STRUCT_FSET(sb, len, addrlen);
5640 
5641 		/* Copy out the address */
5642 		mi_copyout(q, mp);
5643 	}
5644 }
5645 
5646 static int
5647 icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
5648     void *thisdg_attrs)
5649 {
5650 	struct T_unitdata_req *udreqp;
5651 	int is_absreq_failure;
5652 	cred_t *cr;
5653 
5654 	udreqp = (struct T_unitdata_req *)mp->b_rptr;
5655 	*errorp = 0;
5656 
5657 	/*
5658 	 * All Solaris components should pass a db_credp
5659 	 * for this TPI message, hence we ASSERT.
5660 	 * But in case there is some other M_PROTO that looks
5661 	 * like a TPI message sent by some other kernel
5662 	 * component, we check and return an error.
5663 	 */
5664 	cr = msg_getcred(mp, NULL);
5665 	ASSERT(cr != NULL);
5666 	if (cr == NULL)
5667 		return (-1);
5668 
5669 	*errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
5670 	    udreqp->OPT_offset, cr, &icmp_opt_obj,
5671 	    thisdg_attrs, &is_absreq_failure);
5672 
5673 	if (*errorp != 0) {
5674 		/*
5675 		 * Note: No special action needed in this
5676 		 * module for "is_absreq_failure"
5677 		 */
5678 		return (-1);		/* failure */
5679 	}
5680 	ASSERT(is_absreq_failure == 0);
5681 	return (0);	/* success */
5682 }
5683 
5684 void
5685 icmp_ddi_g_init(void)
5686 {
5687 	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
5688 	    icmp_opt_obj.odb_opt_arr_cnt);
5689 
5690 	/*
5691 	 * We want to be informed each time a stack is created or
5692 	 * destroyed in the kernel, so we can maintain the
5693 	 * set of icmp_stack_t's.
5694 	 */
5695 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
5696 }
5697 
5698 void
5699 icmp_ddi_g_destroy(void)
5700 {
5701 	netstack_unregister(NS_ICMP);
5702 }
5703 
5704 #define	INET_NAME	"ip"
5705 
5706 /*
5707  * Initialize the ICMP stack instance.
5708  */
5709 static void *
5710 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5711 {
5712 	icmp_stack_t	*is;
5713 	icmpparam_t	*pa;
5714 	int		error = 0;
5715 	major_t		major;
5716 
5717 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
5718 	is->is_netstack = ns;
5719 
5720 	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
5721 	is->is_param_arr = pa;
5722 	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
5723 
5724 	(void) icmp_param_register(&is->is_nd,
5725 	    is->is_param_arr, A_CNT(icmp_param_arr));
5726 	is->is_ksp = rawip_kstat_init(stackid);
5727 
5728 	major = mod_name_to_major(INET_NAME);
5729 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
5730 	ASSERT(error == 0);
5731 	return (is);
5732 }
5733 
5734 /*
5735  * Free the ICMP stack instance.
5736  */
5737 static void
5738 rawip_stack_fini(netstackid_t stackid, void *arg)
5739 {
5740 	icmp_stack_t *is = (icmp_stack_t *)arg;
5741 
5742 	nd_free(&is->is_nd);
5743 	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
5744 	is->is_param_arr = NULL;
5745 
5746 	rawip_kstat_fini(stackid, is->is_ksp);
5747 	is->is_ksp = NULL;
5748 	ldi_ident_release(is->is_ldi_ident);
5749 	kmem_free(is, sizeof (*is));
5750 }
5751 
5752 static void *
5753 rawip_kstat_init(netstackid_t stackid) {
5754 	kstat_t	*ksp;
5755 
5756 	rawip_named_kstat_t template = {
5757 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5758 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5759 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5760 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5761 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5762 	};
5763 
5764 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5765 					KSTAT_TYPE_NAMED,
5766 					NUM_OF_FIELDS(rawip_named_kstat_t),
5767 					0, stackid);
5768 	if (ksp == NULL || ksp->ks_data == NULL)
5769 		return (NULL);
5770 
5771 	bcopy(&template, ksp->ks_data, sizeof (template));
5772 	ksp->ks_update = rawip_kstat_update;
5773 	ksp->ks_private = (void *)(uintptr_t)stackid;
5774 
5775 	kstat_install(ksp);
5776 	return (ksp);
5777 }
5778 
5779 static void
5780 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5781 {
5782 	if (ksp != NULL) {
5783 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5784 		kstat_delete_netstack(ksp, stackid);
5785 	}
5786 }
5787 
5788 static int
5789 rawip_kstat_update(kstat_t *ksp, int rw)
5790 {
5791 	rawip_named_kstat_t *rawipkp;
5792 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5793 	netstack_t	*ns;
5794 	icmp_stack_t	*is;
5795 
5796 	if ((ksp == NULL) || (ksp->ks_data == NULL))
5797 		return (EIO);
5798 
5799 	if (rw == KSTAT_WRITE)
5800 		return (EACCES);
5801 
5802 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5803 
5804 	ns = netstack_find_by_stackid(stackid);
5805 	if (ns == NULL)
5806 		return (-1);
5807 	is = ns->netstack_icmp;
5808 	if (is == NULL) {
5809 		netstack_rele(ns);
5810 		return (-1);
5811 	}
5812 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5813 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5814 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5815 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5816 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5817 	netstack_rele(ns);
5818 	return (0);
5819 }
5820 
5821 /* ARGSUSED */
5822 int
5823 rawip_accept(sock_lower_handle_t lproto_handle,
5824     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5825     cred_t *cr)
5826 {
5827 	return (EOPNOTSUPP);
5828 }
5829 
5830 /* ARGSUSED */
5831 int
5832 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5833     socklen_t len, cred_t *cr)
5834 {
5835 	conn_t  *connp = (conn_t *)proto_handle;
5836 	int error;
5837 
5838 	/* All Solaris components should pass a cred for this operation. */
5839 	ASSERT(cr != NULL);
5840 
5841 	/* Binding to a NULL address really means unbind */
5842 	if (sa == NULL)
5843 		error = rawip_do_unbind(connp);
5844 	else
5845 		error = rawip_do_bind(connp, sa, len);
5846 
5847 	if (error < 0) {
5848 		if (error == -TOUTSTATE)
5849 			error = EINVAL;
5850 		else
5851 			error = proto_tlitosyserr(-error);
5852 	}
5853 	return (error);
5854 }
5855 
5856 static int
5857 rawip_implicit_bind(conn_t *connp)
5858 {
5859 	sin6_t sin6addr;
5860 	sin_t *sin;
5861 	sin6_t *sin6;
5862 	socklen_t len;
5863 	int error;
5864 
5865 	if (connp->conn_icmp->icmp_family == AF_INET) {
5866 		len = sizeof (struct sockaddr_in);
5867 		sin = (sin_t *)&sin6addr;
5868 		*sin = sin_null;
5869 		sin->sin_family = AF_INET;
5870 		sin->sin_addr.s_addr = INADDR_ANY;
5871 	} else {
5872 		ASSERT(connp->conn_icmp->icmp_family == AF_INET6);
5873 		len = sizeof (sin6_t);
5874 		sin6 = (sin6_t *)&sin6addr;
5875 		*sin6 = sin6_null;
5876 		sin6->sin6_family = AF_INET6;
5877 		V6_SET_ZERO(sin6->sin6_addr);
5878 	}
5879 
5880 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5881 
5882 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5883 }
5884 
5885 static int
5886 rawip_unbind(conn_t *connp)
5887 {
5888 	int error;
5889 
5890 	error = rawip_do_unbind(connp);
5891 	if (error < 0) {
5892 		error = proto_tlitosyserr(-error);
5893 	}
5894 	return (error);
5895 }
5896 
5897 /* ARGSUSED */
5898 int
5899 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5900 {
5901 	return (EOPNOTSUPP);
5902 }
5903 
5904 /* ARGSUSED */
5905 int
5906 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5907     socklen_t len, sock_connid_t *id, cred_t *cr)
5908 {
5909 	conn_t	*connp = (conn_t *)proto_handle;
5910 	icmp_t *icmp = connp->conn_icmp;
5911 	int	error;
5912 	boolean_t did_bind = B_FALSE;
5913 
5914 	/* All Solaris components should pass a cred for this operation. */
5915 	ASSERT(cr != NULL);
5916 
5917 	if (sa == NULL) {
5918 		/*
5919 		 * Disconnect
5920 		 * Make sure we are connected
5921 		 */
5922 		if (icmp->icmp_state != TS_DATA_XFER)
5923 			return (EINVAL);
5924 
5925 		error = icmp_disconnect(connp);
5926 		return (error);
5927 	}
5928 
5929 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
5930 	if (error != 0)
5931 		return (error);
5932 
5933 	/* do an implicit bind if necessary */
5934 	if (icmp->icmp_state == TS_UNBND) {
5935 		error = rawip_implicit_bind(connp);
5936 		/*
5937 		 * We could be racing with an actual bind, in which case
5938 		 * we would see EPROTO. We cross our fingers and try
5939 		 * to connect.
5940 		 */
5941 		if (!(error == 0 || error == EPROTO))
5942 			return (error);
5943 		did_bind = B_TRUE;
5944 	}
5945 
5946 	/*
5947 	 * set SO_DGRAM_ERRIND
5948 	 */
5949 	icmp->icmp_dgram_errind = B_TRUE;
5950 
5951 	error = rawip_do_connect(connp, sa, len, cr);
5952 
5953 	if (error != 0 && did_bind) {
5954 		int unbind_err;
5955 
5956 		unbind_err = rawip_unbind(connp);
5957 		ASSERT(unbind_err == 0);
5958 	}
5959 
5960 	if (error == 0) {
5961 		*id = 0;
5962 		(*connp->conn_upcalls->su_connected)
5963 		    (connp->conn_upper_handle, 0, NULL, -1);
5964 	} else if (error < 0) {
5965 		error = proto_tlitosyserr(-error);
5966 	}
5967 	return (error);
5968 }
5969 
/*
 * fallback downcall: convert this endpoint from a non-STREAMS socket to
 * a STREAMS one using the queue pair q.  The conn is attached to q, the
 * stream head is told about write offset and high-water mark, the helper
 * stream is freed, the socket layer is given the current capability,
 * address and option state through quiesced_cb, and any packets that
 * were held on the icmp fallback queue are sent up the new stream.
 */
/* ARGSUSED */
void
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
{
	conn_t  *connp = (conn_t *)proto_handle;
	icmp_t	*icmp;
	struct T_capability_ack tca;
	struct sockaddr_in6 laddr, faddr;
	socklen_t laddrlen, faddrlen;
	short opts;
	struct stroptions *stropt;
	mblk_t *stropt_mp;
	int error;

	icmp = connp->conn_icmp;

	/* Allocated up front; used for the M_SETOPTS message below. */
	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);

	/*
	 * Set up the fallback stream that was allocated.  The values found
	 * in the queues' q_ptr fields are saved in the conn before q_ptr is
	 * pointed at the conn itself.
	 */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	WR(q)->q_qinfo = &icmpwinit;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	/* Notify stream head about options before sending up data */
	stropt_mp->b_datap->db_type = M_SETOPTS;
	stropt_mp->b_wptr += sizeof (*stropt);
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt->so_flags = SO_WROFF | SO_HIWAT;
	stropt->so_wroff =
	    (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra);
	stropt->so_hiwat = icmp->icmp_recv_hiwat;
	putnext(RD(q), stropt_mp);

	/*
	 * Free the helper stream.
	 */
	ip_free_helper_stream(connp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	icmp_do_capability_ack(icmp, &tca, TC1_INFO);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) rawip_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = rawip_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	/* No peer address is reported if getpeername failed. */
	if (error != 0)
		faddrlen = 0;
	opts = 0;
	if (icmp->icmp_dgram_errind)
		opts |= SO_DGRAM_ERRIND;
	if (icmp->icmp_dontroute)
		opts |= SO_DONTROUTE;

	/*
	 * Notify the socket that the endpoint is quiescent and that it is
	 * therefore safe to move data from the socket to the stream head.
	 * NOTE(review): the original comment here referred to grabbing a
	 * "drain lock" so that no data will be sent up to the socket; no
	 * such lock is taken in this function -- confirm what guarantees
	 * quiescence at this point.
	 */
	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/*
	 * Push up any packets that were queued in icmp_t.  The receive
	 * lock is dropped around each putnext() and re-taken afterwards.
	 */

	mutex_enter(&icmp->icmp_recv_lock);
	while (icmp->icmp_fallback_queue_head != NULL) {
		mblk_t	*mp;

		mp = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp->b_next;
		mp->b_next = NULL;
		mutex_exit(&icmp->icmp_recv_lock);
		putnext(RD(q), mp);
		mutex_enter(&icmp->icmp_recv_lock);
	}
	/* The head is NULL here, so this clears the tail as well. */
	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
	/*
	 * No longer a streams-less socket
	 */
	connp->conn_flags &= ~IPCL_NONSTR;
	mutex_exit(&icmp->icmp_recv_lock);
	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
	    icmp->icmp_fallback_queue_tail == NULL);

	ASSERT(connp->conn_ref >= 1);
}
6071 
6072 /* ARGSUSED */
6073 sock_lower_handle_t
6074 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
6075     uint_t *smodep, int *errorp, int flags, cred_t *credp)
6076 {
6077 	conn_t *connp;
6078 
6079 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
6080 		*errorp = EPROTONOSUPPORT;
6081 		return (NULL);
6082 	}
6083 
6084 	connp = icmp_open(family, credp, errorp, flags);
6085 	if (connp != NULL) {
6086 		icmp_stack_t *is;
6087 
6088 		is = connp->conn_icmp->icmp_is;
6089 		connp->conn_flags |= IPCL_NONSTR;
6090 
6091 		if (connp->conn_icmp->icmp_family == AF_INET6) {
6092 			/* Build initial header template for transmit */
6093 			rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
6094 			if ((*errorp =
6095 			    icmp_build_hdrs(connp->conn_icmp)) != 0) {
6096 				rw_exit(&connp->conn_icmp->icmp_rwlock);
6097 				ipcl_conn_destroy(connp);
6098 				return (NULL);
6099 			}
6100 			rw_exit(&connp->conn_icmp->icmp_rwlock);
6101 		}
6102 
6103 		connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat;
6104 		connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat;
6105 
6106 		if ((*errorp = ip_create_helper_stream(connp,
6107 		    is->is_ldi_ident)) != 0) {
6108 			cmn_err(CE_CONT, "create of IP helper stream failed\n");
6109 			(void) rawip_do_close(connp);
6110 			return (NULL);
6111 		}
6112 
6113 		mutex_enter(&connp->conn_lock);
6114 		connp->conn_state_flags &= ~CONN_INCIPIENT;
6115 		mutex_exit(&connp->conn_lock);
6116 		*sock_downcalls = &sock_rawip_downcalls;
6117 		*smodep = SM_ATOMIC;
6118 	} else {
6119 		ASSERT(*errorp != 0);
6120 	}
6121 
6122 	return ((sock_lower_handle_t)connp);
6123 }
6124 
6125 /* ARGSUSED */
6126 void
6127 rawip_activate(sock_lower_handle_t proto_handle,
6128     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
6129     cred_t *cr)
6130 {
6131 	conn_t 			*connp = (conn_t *)proto_handle;
6132 	icmp_stack_t 		*is = connp->conn_icmp->icmp_is;
6133 	struct sock_proto_props sopp;
6134 
6135 	/* All Solaris components should pass a cred for this operation. */
6136 	ASSERT(cr != NULL);
6137 
6138 	connp->conn_upcalls = sock_upcalls;
6139 	connp->conn_upper_handle = sock_handle;
6140 
6141 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
6142 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
6143 	sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len +
6144 	    is->is_wroff_extra;
6145 	sopp.sopp_rxhiwat = is->is_recv_hiwat;
6146 	sopp.sopp_rxlowat = icmp_mod_info.mi_lowat;
6147 	sopp.sopp_maxblk = INFPSZ;
6148 	sopp.sopp_maxpsz = IP_MAXPACKET;
6149 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
6150 	    icmp_mod_info.mi_minpsz;
6151 
6152 	(*connp->conn_upcalls->su_set_proto_props)
6153 	    (connp->conn_upper_handle, &sopp);
6154 }
6155 
6156 static int
6157 rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6158 {
6159 	sin_t	*sin = (sin_t *)sa;
6160 	sin6_t	*sin6 = (sin6_t *)sa;
6161 
6162 	ASSERT(icmp != NULL);
6163 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6164 
6165 	switch (icmp->icmp_family) {
6166 	case AF_INET:
6167 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6168 		if (*salenp < sizeof (sin_t))
6169 			return (EINVAL);
6170 
6171 		*salenp = sizeof (sin_t);
6172 		*sin = sin_null;
6173 		sin->sin_family = AF_INET;
6174 		if (icmp->icmp_state == TS_UNBND) {
6175 			break;
6176 		}
6177 
6178 		if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
6179 		    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6180 			sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src);
6181 		} else {
6182 			/*
6183 			 * INADDR_ANY
6184 			 * icmp_v6src is not set, we might be bound to
6185 			 * broadcast/multicast. Use icmp_bound_v6src as
6186 			 * local address instead (that could
6187 			 * also still be INADDR_ANY)
6188 			 */
6189 			sin->sin_addr.s_addr =
6190 			    V4_PART_OF_V6(icmp->icmp_bound_v6src);
6191 		}
6192 		break;
6193 	case AF_INET6:
6194 
6195 		if (*salenp < sizeof (sin6_t))
6196 			return (EINVAL);
6197 
6198 		*salenp = sizeof (sin6_t);
6199 		*sin6 = sin6_null;
6200 		sin6->sin6_family = AF_INET6;
6201 		if (icmp->icmp_state == TS_UNBND) {
6202 			break;
6203 		}
6204 		if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6205 			sin6->sin6_addr = icmp->icmp_v6src;
6206 		} else {
6207 			/*
6208 			 * UNSPECIFIED
6209 			 * icmp_v6src is not set, we might be bound to
6210 			 * broadcast/multicast. Use icmp_bound_v6src as
6211 			 * local address instead (that could
6212 			 * also still be UNSPECIFIED)
6213 			 */
6214 
6215 			sin6->sin6_addr = icmp->icmp_bound_v6src;
6216 		}
6217 		break;
6218 	}
6219 	return (0);
6220 }
6221 
6222 static int
6223 rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6224 {
6225 	sin_t   *sin = (sin_t *)sa;
6226 	sin6_t  *sin6 = (sin6_t *)sa;
6227 
6228 	ASSERT(icmp != NULL);
6229 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6230 
6231 	if (icmp->icmp_state != TS_DATA_XFER)
6232 		return (ENOTCONN);
6233 
6234 	sa->sa_family = icmp->icmp_family;
6235 	switch (icmp->icmp_family) {
6236 	case AF_INET:
6237 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6238 
6239 		if (*salenp < sizeof (sin_t))
6240 			return (EINVAL);
6241 
6242 		*salenp = sizeof (sin_t);
6243 		*sin = sin_null;
6244 		sin->sin_family = AF_INET;
6245 		sin->sin_addr.s_addr =
6246 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6247 		break;
6248 	case AF_INET6:
6249 		if (*salenp < sizeof (sin6_t))
6250 			return (EINVAL);
6251 
6252 		*salenp = sizeof (sin6_t);
6253 		*sin6 = sin6_null;
6254 		*sin6 = icmp->icmp_v6dst;
6255 		break;
6256 	}
6257 	return (0);
6258 }
6259 
6260 /* ARGSUSED */
6261 int
6262 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6263     socklen_t *salenp, cred_t *cr)
6264 {
6265 	conn_t  *connp = (conn_t *)proto_handle;
6266 	icmp_t  *icmp = connp->conn_icmp;
6267 	int	error;
6268 
6269 	/* All Solaris components should pass a cred for this operation. */
6270 	ASSERT(cr != NULL);
6271 
6272 	ASSERT(icmp != NULL);
6273 
6274 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6275 
6276 	error = rawip_do_getpeername(icmp, sa, salenp);
6277 
6278 	rw_exit(&icmp->icmp_rwlock);
6279 
6280 	return (error);
6281 }
6282 
6283 /* ARGSUSED */
6284 int
6285 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6286     socklen_t *salenp, cred_t *cr)
6287 {
6288 	conn_t  *connp = (conn_t *)proto_handle;
6289 	icmp_t	*icmp = connp->conn_icmp;
6290 	int	error;
6291 
6292 	/* All Solaris components should pass a cred for this operation. */
6293 	ASSERT(cr != NULL);
6294 
6295 	ASSERT(icmp != NULL);
6296 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6297 
6298 	error = rawip_do_getsockname(icmp, sa, salenp);
6299 
6300 	rw_exit(&icmp->icmp_rwlock);
6301 
6302 	return (error);
6303 }
6304 
6305 int
6306 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6307     const void *optvalp, socklen_t optlen, cred_t *cr)
6308 {
6309 	conn_t	*connp = (conn_t *)proto_handle;
6310 	icmp_t *icmp = connp->conn_icmp;
6311 	int error;
6312 
6313 	/* All Solaris components should pass a cred for this operation. */
6314 	ASSERT(cr != NULL);
6315 
6316 	error = proto_opt_check(level, option_name, optlen, NULL,
6317 	    icmp_opt_obj.odb_opt_des_arr,
6318 	    icmp_opt_obj.odb_opt_arr_cnt,
6319 	    icmp_opt_obj.odb_topmost_tpiprovider,
6320 	    B_TRUE, B_FALSE, cr);
6321 
6322 	if (error != 0) {
6323 		/*
6324 		 * option not recognized
6325 		 */
6326 		if (error < 0) {
6327 			error = proto_tlitosyserr(-error);
6328 		}
6329 		return (error);
6330 	}
6331 
6332 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6333 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
6334 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
6335 	    (uchar_t *)optvalp, NULL, cr);
6336 	rw_exit(&icmp->icmp_rwlock);
6337 
6338 	if (error < 0) {
6339 		/*
6340 		 * Pass on to ip
6341 		 */
6342 		error = ip_set_options(connp, level, option_name, optvalp,
6343 		    optlen, cr);
6344 	}
6345 
6346 	ASSERT(error >= 0);
6347 
6348 	return (error);
6349 }
6350 
6351 int
6352 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6353     void *optvalp, socklen_t *optlen, cred_t *cr)
6354 {
6355 	int		error;
6356 	conn_t		*connp = (conn_t *)proto_handle;
6357 	icmp_t		*icmp = connp->conn_icmp;
6358 	t_uscalar_t	max_optbuf_len;
6359 	void		*optvalp_buf;
6360 	int		len;
6361 
6362 	/* All Solaris components should pass a cred for this operation. */
6363 	ASSERT(cr != NULL);
6364 
6365 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
6366 	    icmp_opt_obj.odb_opt_des_arr,
6367 	    icmp_opt_obj.odb_opt_arr_cnt,
6368 	    icmp_opt_obj.odb_topmost_tpiprovider,
6369 	    B_FALSE, B_TRUE, cr);
6370 
6371 	if (error != 0) {
6372 		if (error < 0) {
6373 			error = proto_tlitosyserr(-error);
6374 		}
6375 		return (error);
6376 	}
6377 
6378 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
6379 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6380 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
6381 	rw_exit(&icmp->icmp_rwlock);
6382 
6383 	if (len < 0) {
6384 		/*
6385 		 * Pass on to IP
6386 		 */
6387 		kmem_free(optvalp_buf, max_optbuf_len);
6388 		return (ip_get_options(connp, level, option_name, optvalp,
6389 		    optlen, cr));
6390 	} else {
6391 		/*
6392 		 * update optlen and copy option value
6393 		 */
6394 		t_uscalar_t size = MIN(len, *optlen);
6395 		bcopy(optvalp_buf, optvalp, size);
6396 		bcopy(&size, optlen, sizeof (size));
6397 
6398 		kmem_free(optvalp_buf, max_optbuf_len);
6399 		return (0);
6400 	}
6401 }
6402 
6403 /* ARGSUSED */
6404 int
6405 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
6406 {
6407 	conn_t	*connp = (conn_t *)proto_handle;
6408 
6409 	/* All Solaris components should pass a cred for this operation. */
6410 	ASSERT(cr != NULL);
6411 
6412 	(void) rawip_do_close(connp);
6413 	return (0);
6414 }
6415 
6416 /* ARGSUSED */
6417 int
6418 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
6419 {
6420 	conn_t  *connp = (conn_t *)proto_handle;
6421 
6422 	/* All Solaris components should pass a cred for this operation. */
6423 	ASSERT(cr != NULL);
6424 
6425 	/* shut down the send side */
6426 	if (how != SHUT_RD)
6427 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6428 		    SOCK_OPCTL_SHUT_SEND, 0);
6429 	/* shut down the recv side */
6430 	if (how != SHUT_WR)
6431 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6432 		    SOCK_OPCTL_SHUT_RECV, 0);
6433 	return (0);
6434 }
6435 
6436 void
6437 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
6438 {
6439 	conn_t  *connp = (conn_t *)proto_handle;
6440 	icmp_t	*icmp = connp->conn_icmp;
6441 
6442 	mutex_enter(&icmp->icmp_recv_lock);
6443 	connp->conn_flow_cntrld = B_FALSE;
6444 	mutex_exit(&icmp->icmp_recv_lock);
6445 }
6446 
6447 int
6448 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
6449     int mode, int32_t *rvalp, cred_t *cr)
6450 {
6451 	conn_t  	*connp = (conn_t *)proto_handle;
6452 	int		error;
6453 
6454 	/* All Solaris components should pass a cred for this operation. */
6455 	ASSERT(cr != NULL);
6456 
6457 	switch (cmd) {
6458 	case ND_SET:
6459 	case ND_GET:
6460 	case _SIOCSOCKFALLBACK:
6461 	case TI_GETPEERNAME:
6462 	case TI_GETMYNAME:
6463 #ifdef DEBUG
6464 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
6465 		    " socket", cmd);
6466 #endif
6467 		error = EINVAL;
6468 		break;
6469 	default:
6470 		/*
6471 		 * Pass on to IP using helper stream
6472 		 */
6473 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
6474 		    cmd, arg, mode, cr, rvalp);
6475 		break;
6476 	}
6477 	return (error);
6478 }
6479 
6480 /* ARGSUSED */
6481 int
6482 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
6483     cred_t *cr)
6484 {
6485 	conn_t *connp = (conn_t *)proto_handle;
6486 	icmp_t	*icmp = connp->conn_icmp;
6487 	icmp_stack_t *is = icmp->icmp_is;
6488 	int error = 0;
6489 	boolean_t bypass_dgram_errind = B_FALSE;
6490 
6491 	ASSERT(DB_TYPE(mp) == M_DATA);
6492 
6493 	/* All Solaris components should pass a cred for this operation. */
6494 	ASSERT(cr != NULL);
6495 
6496 	/* If labeled then sockfs should have already set db_credp */
6497 	ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
6498 
6499 	/* do an implicit bind if necessary */
6500 	if (icmp->icmp_state == TS_UNBND) {
6501 		error = rawip_implicit_bind(connp);
6502 		/*
6503 		 * We could be racing with an actual bind, in which case
6504 		 * we would see EPROTO. We cross our fingers and try
6505 		 * to connect.
6506 		 */
6507 		if (!(error == 0 || error == EPROTO)) {
6508 			freemsg(mp);
6509 			return (error);
6510 		}
6511 	}
6512 
6513 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6514 
6515 	if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) {
6516 		error = EISCONN;
6517 		goto done_lock;
6518 	}
6519 
6520 	switch (icmp->icmp_family) {
6521 	case AF_INET6: {
6522 		sin6_t	*sin6;
6523 		ip6_pkt_t	ipp_s;	/* For ancillary data options */
6524 		ip6_pkt_t	*ipp = &ipp_s;
6525 
6526 		sin6 = (sin6_t *)msg->msg_name;
6527 		if (sin6 != NULL) {
6528 			error = proto_verify_ip_addr(icmp->icmp_family,
6529 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6530 			if (error != 0) {
6531 				bypass_dgram_errind = B_TRUE;
6532 				goto done_lock;
6533 			}
6534 			if (icmp->icmp_delayed_error != 0) {
6535 				sin6_t  *sin1 = (sin6_t *)msg->msg_name;
6536 				sin6_t  *sin2 = (sin6_t *)
6537 				    &icmp->icmp_delayed_addr;
6538 
6539 				error = icmp->icmp_delayed_error;
6540 				icmp->icmp_delayed_error = 0;
6541 
6542 				/* Compare IP address and port */
6543 
6544 				if (sin1->sin6_port == sin2->sin6_port &&
6545 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
6546 				    &sin2->sin6_addr)) {
6547 					goto done_lock;
6548 				}
6549 			}
6550 		} else {
6551 			/*
6552 			 * Use connected address
6553 			 */
6554 			if (icmp->icmp_state != TS_DATA_XFER) {
6555 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6556 				error = EDESTADDRREQ;
6557 				bypass_dgram_errind = B_TRUE;
6558 				goto done_lock;
6559 			}
6560 			sin6 = &icmp->icmp_v6dst;
6561 		}
6562 
6563 		/* No support for mapped addresses on raw sockets */
6564 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6565 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6566 			error = EADDRNOTAVAIL;
6567 			goto done_lock;
6568 		}
6569 
6570 		ipp->ipp_fields = 0;
6571 		ipp->ipp_sticky_ignored = 0;
6572 
6573 		/*
6574 		 * If options passed in, feed it for verification and handling
6575 		 */
6576 		if (msg->msg_controllen != 0) {
6577 			error = process_auxiliary_options(connp,
6578 			    msg->msg_control, msg->msg_controllen,
6579 			    ipp, &icmp_opt_obj, icmp_opt_set, cr);
6580 			if (error != 0) {
6581 				goto done_lock;
6582 			}
6583 		}
6584 
6585 		rw_exit(&icmp->icmp_rwlock);
6586 
6587 		/*
6588 		 * Destination is a native IPv6 address.
6589 		 * Send out an IPv6 format packet.
6590 		 */
6591 
6592 		error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6,
6593 		    ipp);
6594 	}
6595 		break;
6596 	case AF_INET: {
6597 		sin_t	*sin;
6598 		ip4_pkt_t pktinfo;
6599 		ip4_pkt_t *pktinfop = &pktinfo;
6600 		ipaddr_t	v4dst;
6601 
6602 		sin = (sin_t *)msg->msg_name;
6603 		if (sin != NULL) {
6604 			error = proto_verify_ip_addr(icmp->icmp_family,
6605 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6606 			if (error != 0) {
6607 				bypass_dgram_errind = B_TRUE;
6608 				goto done_lock;
6609 			}
6610 			v4dst = sin->sin_addr.s_addr;
6611 			if (icmp->icmp_delayed_error != 0) {
6612 				sin_t *sin1 = (sin_t *)msg->msg_name;
6613 				sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
6614 
6615 				error = icmp->icmp_delayed_error;
6616 				icmp->icmp_delayed_error = 0;
6617 
6618 				/* Compare IP address and port */
6619 				if (sin1->sin_port == sin2->sin_port &&
6620 				    sin1->sin_addr.s_addr ==
6621 				    sin2->sin_addr.s_addr) {
6622 					goto done_lock;
6623 				}
6624 
6625 			}
6626 		} else {
6627 			/*
6628 			 * Use connected address
6629 			 */
6630 			if (icmp->icmp_state != TS_DATA_XFER) {
6631 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6632 				error = EDESTADDRREQ;
6633 				bypass_dgram_errind = B_TRUE;
6634 				goto done_lock;
6635 			}
6636 			v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6637 		}
6638 
6639 
6640 		pktinfop->ip4_ill_index = 0;
6641 		pktinfop->ip4_addr = INADDR_ANY;
6642 
6643 		/*
6644 		 * If options passed in, feed it for verification and handling
6645 		 */
6646 		if (msg->msg_controllen != 0) {
6647 			error = process_auxiliary_options(connp,
6648 			    msg->msg_control, msg->msg_controllen,
6649 			    pktinfop, &icmp_opt_obj, icmp_opt_set, cr);
6650 			if (error != 0) {
6651 				goto done_lock;
6652 			}
6653 		}
6654 		rw_exit(&icmp->icmp_rwlock);
6655 
6656 		error = raw_ip_send_data_v4(connp->conn_wq, connp, mp,
6657 		    v4dst, pktinfop);
6658 		break;
6659 	}
6660 
6661 	default:
6662 		ASSERT(0);
6663 	}
6664 
6665 	goto done;
6666 
6667 done_lock:
6668 	rw_exit(&icmp->icmp_rwlock);
6669 	if (error != 0) {
6670 		ASSERT(mp != NULL);
6671 		freemsg(mp);
6672 	}
6673 done:
6674 	if (bypass_dgram_errind)
6675 		return (error);
6676 	return (icmp->icmp_dgram_errind ? error : 0);
6677 }
6678 
/*
 * Table of the rawip_* socket entry points.  The initializer is positional,
 * so the order of the entries has to follow the member order of
 * sock_downcalls_t.  Three slots between rawip_send and rawip_shutdown are
 * left NULL; see the sock_downcalls_t declaration for which operations
 * they correspond to.
 */
sock_downcalls_t sock_rawip_downcalls = {
	rawip_activate,
	rawip_accept,
	rawip_bind,
	rawip_listen,
	rawip_connect,
	rawip_getpeername,
	rawip_getsockname,
	rawip_getsockopt,
	rawip_setsockopt,
	rawip_send,
	NULL,
	NULL,
	NULL,
	rawip_shutdown,
	rawip_clr_flowctrl,
	rawip_ioctl,
	rawip_close
};
6698