xref: /titanic_51/usr/src/uts/common/inet/ip/icmp.c (revision 4944376cd5de3dcd3b4feeaad9cbedbc024d1474)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/stropts.h>
30 #include <sys/strlog.h>
31 #include <sys/strsun.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/timod.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/kmem.h>
41 #include <sys/policy.h>
42 #include <sys/priv.h>
43 #include <sys/zone.h>
44 #include <sys/time.h>
45 
46 #include <sys/sockio.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/isa_defs.h>
50 #include <sys/suntpi.h>
51 #include <sys/xti_inet.h>
52 #include <sys/netstack.h>
53 
54 #include <net/route.h>
55 #include <net/if.h>
56 
57 #include <netinet/in.h>
58 #include <netinet/ip6.h>
59 #include <netinet/icmp6.h>
60 #include <inet/common.h>
61 #include <inet/ip.h>
62 #include <inet/ip6.h>
63 #include <inet/proto_set.h>
64 #include <inet/nd.h>
65 #include <inet/optcom.h>
66 #include <inet/snmpcom.h>
67 #include <inet/kstatcom.h>
68 #include <inet/rawip_impl.h>
69 
70 #include <netinet/ip_mroute.h>
71 #include <inet/tcp.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 #include <inet/ipclassifier.h>
75 
76 #include <sys/tsol/label.h>
77 #include <sys/tsol/tnet.h>
78 
79 #include <inet/ip_ire.h>
80 #include <inet/ip_if.h>
81 
82 #include <inet/ip_impl.h>
83 #include <sys/disp.h>
84 
85 /*
86  * Synchronization notes:
87  *
 * RAWIP is MT and uses the usual kernel synchronization primitives. There is
 * one lock, icmp_rwlock. We also use conn_lock when updating things
90  * which affect the IP classifier lookup.
91  * The lock order is icmp_rwlock -> conn_lock.
92  *
93  * The icmp_rwlock:
94  * This protects most of the other fields in the icmp_t. The exact list of
95  * fields which are protected by each of the above locks is documented in
96  * the icmp_t structure definition.
97  *
98  * Plumbing notes:
99  * ICMP is always a device driver. For compatibility with mibopen() code
100  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
101  * dummy module.
102  */
103 
104 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
105 static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
106 static int	icmp_bind_proto(conn_t *connp);
107 static int	icmp_build_hdrs(icmp_t *icmp);
108 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
109 static int	icmp_close(queue_t *q, int flags);
110 static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
111 static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
112 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
113 		    int sys_error);
114 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
115 		    t_scalar_t t_error, int sys_error);
116 static void	icmp_icmp_error(conn_t *connp, mblk_t *mp);
117 static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp);
118 static void	icmp_info_req(queue_t *q, mblk_t *mp);
119 static void	icmp_input(void *, mblk_t *, void *);
120 static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
121 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
122 		    cred_t *credp);
123 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
124 		    cred_t *credp);
125 static int	icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
126 		    int *errorp, void *thisdg_attrs);
127 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
128 int		icmp_opt_set(conn_t *connp, uint_t optset_context,
129 		    int level, int name, uint_t inlen,
130 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
131 		    void *thisdg_attrs, cred_t *cr);
132 int		icmp_opt_get(conn_t *connp, int level, int name,
133 		    uchar_t *ptr);
134 static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
135 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
136 static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
137 		    caddr_t cp, cred_t *cr);
138 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
139 		    uchar_t *ptr, int len);
140 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
141 static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
142 static int	icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst);
143 static void	icmp_wput(queue_t *q, mblk_t *mp);
144 static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
145 static int	raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp,
146 		    sin6_t *sin6, ip6_pkt_t *ipp);
147 static int	raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp,
148 		    ipaddr_t v4dst, ip4_pkt_t *pktinfop);
149 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
150 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
151 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
152 static void	icmp_ulp_recv(conn_t *, mblk_t *);
153 
154 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
155 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
156 
157 static void	*rawip_kstat_init(netstackid_t stackid);
158 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
159 static int	rawip_kstat_update(kstat_t *kp, int rw);
160 static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);
161 static int	rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa,
162 		    uint_t *salenp);
163 static int	rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa,
164 		    uint_t *salenp);
165 
166 int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
167 		    socklen_t *, cred_t *);
168 int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
169 		    socklen_t *, cred_t *);
170 
/*
 * STREAMS module information referenced by every qinit structure in this
 * file.
 * NOTE(review): the values presumably follow the usual struct module_info
 * order (id number, name, min/max packet size, high/low water marks) --
 * confirm against <sys/stream.h>.
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};
174 
/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 * The two read-side tables differ only in that open routine; both use
 * icmp_close and leave the remaining routine slots NULL.
 */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

/* Write side shared by both devices: only icmp_wput is supplied. */
static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info
};
190 
/*
 * ICMP write-side entry point during fallback: identical to icmpwinit
 * except that messages are routed to icmp_wput_fallback.
 */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};
195 
/*
 * For AF_INET aka /dev/icmp.
 * Pairs the IPv4 read-side table with the common write-side table.
 */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/*
 * For AF_INET6 aka /dev/icmp6.
 * Pairs the IPv6 read-side table with the common write-side table.
 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};
205 
/*
 * All-zero template addresses.  They have static storage and no
 * initializer, so they start out zeroed; code in this file copies them
 * by structure assignment to clear a sin_t/sin6_t in one step.
 */
static sin_t	sin_null;	/* Zero IPv4 address for quick clears */
static sin6_t	sin6_null;	/* Zero IPv6 address for quick clears */
208 
/*
 * Default structure copied into T_INFO_ACK messages.
 * ADDR_size and OPT_size are left zero here; per the field comments they
 * are filled in elsewhere before the ack is sent.
 */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
223 
/*
 * Table of ND variables supported by icmp.  These are loaded into is_nd
 * when the stack instance is created.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * The is_* accessor macros that follow index is_param_arr by position, so
 * the order of the entries here and the indices in those macros must be
 * kept in step when a parameter is added, removed or moved.
 */
static icmpparam_t	icmp_param_arr[] = {
	/* min	max	value	name */
	{ 0,	128,	32,	"icmp_wroff_extra" },
	{ 1,	255,	255,	"icmp_ipv4_ttl" },
	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
	{ 0,	1,	1,	"icmp_bsd_compat" },
	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
};
/* Positional shorthands for the current value of each parameter above. */
#define	is_wroff_extra			is_param_arr[0].icmp_param_value
#define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
#define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
#define	is_bsd_compat			is_param_arr[3].icmp_param_value
#define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
#define	is_xmit_lowat			is_param_arr[5].icmp_param_value
#define	is_recv_hiwat			is_param_arr[6].icmp_param_value
#define	is_max_buf			is_param_arr[7].icmp_param_value
248 
249 static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len);
250 static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa,
251     socklen_t len, cred_t *cr);
252 static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error);
253 
254 /*
255  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
256  * passed to icmp_wput.
257  * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP
258  * protocol type placed in the message following the address. A T_BIND_ACK
259  * message is returned by ip_bind_v4/v6.
260  */
261 static void
262 icmp_tpi_bind(queue_t *q, mblk_t *mp)
263 {
264 	int	error;
265 	struct sockaddr *sa;
266 	struct T_bind_req *tbr;
267 	socklen_t	len;
268 	sin_t	*sin;
269 	sin6_t	*sin6;
270 	icmp_t		*icmp;
271 	conn_t	*connp = Q_TO_CONN(q);
272 	mblk_t *mp1;
273 	cred_t *cr;
274 
275 	/*
276 	 * All Solaris components should pass a db_credp
277 	 * for this TPI message, hence we ASSERT.
278 	 * But in case there is some other M_PROTO that looks
279 	 * like a TPI message sent by some other kernel
280 	 * component, we check and return an error.
281 	 */
282 	cr = msg_getcred(mp, NULL);
283 	ASSERT(cr != NULL);
284 	if (cr == NULL) {
285 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
286 		return;
287 	}
288 
289 	icmp = connp->conn_icmp;
290 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
291 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
292 		    "icmp_bind: bad req, len %u",
293 		    (uint_t)(mp->b_wptr - mp->b_rptr));
294 		icmp_err_ack(q, mp, TPROTO, 0);
295 		return;
296 	}
297 
298 	if (icmp->icmp_state != TS_UNBND) {
299 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
300 		    "icmp_bind: bad state, %d", icmp->icmp_state);
301 		icmp_err_ack(q, mp, TOUTSTATE, 0);
302 		return;
303 	}
304 
305 	/*
306 	 * Reallocate the message to make sure we have enough room for an
307 	 * address and the protocol type.
308 	 */
309 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
310 	if (!mp1) {
311 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
312 		return;
313 	}
314 	mp = mp1;
315 
316 	/* Reset the message type in preparation for shipping it back. */
317 	DB_TYPE(mp) = M_PCPROTO;
318 	tbr = (struct T_bind_req *)mp->b_rptr;
319 	len = tbr->ADDR_length;
320 	switch (len) {
321 	case 0:	/* request for a generic port */
322 		tbr->ADDR_offset = sizeof (struct T_bind_req);
323 		if (icmp->icmp_family == AF_INET) {
324 			tbr->ADDR_length = sizeof (sin_t);
325 			sin = (sin_t *)&tbr[1];
326 			*sin = sin_null;
327 			sin->sin_family = AF_INET;
328 			mp->b_wptr = (uchar_t *)&sin[1];
329 			sa = (struct sockaddr *)sin;
330 			len = sizeof (sin_t);
331 		} else {
332 			ASSERT(icmp->icmp_family == AF_INET6);
333 			tbr->ADDR_length = sizeof (sin6_t);
334 			sin6 = (sin6_t *)&tbr[1];
335 			*sin6 = sin6_null;
336 			sin6->sin6_family = AF_INET6;
337 			mp->b_wptr = (uchar_t *)&sin6[1];
338 			sa = (struct sockaddr *)sin6;
339 			len = sizeof (sin6_t);
340 		}
341 		break;
342 
343 	case sizeof (sin_t):	/* Complete IPv4 address */
344 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
345 		    sizeof (sin_t));
346 		break;
347 
348 	case sizeof (sin6_t):	/* Complete IPv6 address */
349 		sa = (struct sockaddr *)mi_offset_param(mp,
350 		    tbr->ADDR_offset, sizeof (sin6_t));
351 		break;
352 
353 	default:
354 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
355 		    "icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
356 		icmp_err_ack(q, mp, TBADADDR, 0);
357 		return;
358 	}
359 
360 	error = rawip_do_bind(connp, sa, len);
361 done:
362 	ASSERT(mp->b_cont == NULL);
363 	if (error != 0) {
364 		if (error > 0) {
365 			icmp_err_ack(q, mp, TSYSERR, error);
366 		} else {
367 			icmp_err_ack(q, mp, -error, 0);
368 		}
369 	} else {
370 		tbr->PRIM_type = T_BIND_ACK;
371 		qreply(q, mp);
372 	}
373 }
374 
375 static int
376 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
377 {
378 	sin_t		*sin;
379 	sin6_t		*sin6;
380 	icmp_t		*icmp;
381 	int		error = 0;
382 	mblk_t		*ire_mp;
383 
384 
385 	icmp = connp->conn_icmp;
386 
387 	if (sa == NULL || !OK_32PTR((char *)sa)) {
388 		return (EINVAL);
389 	}
390 
391 	/*
392 	 * The state must be TS_UNBND. TPI mandates that users must send
393 	 * TPI primitives only 1 at a time and wait for the response before
394 	 * sending the next primitive.
395 	 */
396 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
397 	if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
398 		error = -TOUTSTATE;
399 		goto done;
400 	}
401 
402 	ASSERT(len != 0);
403 	switch (len) {
404 	case sizeof (sin_t):    /* Complete IPv4 address */
405 		sin = (sin_t *)sa;
406 		if (sin->sin_family != AF_INET ||
407 		    icmp->icmp_family != AF_INET) {
408 			/* TSYSERR, EAFNOSUPPORT */
409 			error = EAFNOSUPPORT;
410 			goto done;
411 		}
412 		break;
413 	case sizeof (sin6_t): /* Complete IPv6 address */
414 		sin6 = (sin6_t *)sa;
415 		if (sin6->sin6_family != AF_INET6 ||
416 		    icmp->icmp_family != AF_INET6) {
417 			/* TSYSERR, EAFNOSUPPORT */
418 			error = EAFNOSUPPORT;
419 			goto done;
420 		}
421 		/* No support for mapped addresses on raw sockets */
422 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
423 			/* TSYSERR, EADDRNOTAVAIL */
424 			error = EADDRNOTAVAIL;
425 			goto done;
426 		}
427 		break;
428 
429 	default:
430 		/* TBADADDR */
431 		error = EADDRNOTAVAIL;
432 		goto done;
433 	}
434 
435 	icmp->icmp_pending_op = T_BIND_REQ;
436 	icmp->icmp_state = TS_IDLE;
437 
438 	/*
439 	 * Copy the source address into our icmp structure.  This address
440 	 * may still be zero; if so, ip will fill in the correct address
441 	 * each time an outbound packet is passed to it.
442 	 * If we are binding to a broadcast or multicast address then
443 	 * rawip_post_ip_bind_connect will clear the source address.
444 	 */
445 
446 	if (icmp->icmp_family == AF_INET) {
447 		ASSERT(sin != NULL);
448 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
449 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr,
450 		    &icmp->icmp_v6src);
451 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
452 		    icmp->icmp_ip_snd_options_len;
453 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
454 	} else {
455 		int error;
456 
457 		ASSERT(sin6 != NULL);
458 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
459 		icmp->icmp_v6src = sin6->sin6_addr;
460 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
461 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
462 
463 		/* Rebuild the header template */
464 		error = icmp_build_hdrs(icmp);
465 		if (error != 0) {
466 			icmp->icmp_pending_op = -1;
467 			/*
468 			 * TSYSERR
469 			 */
470 			goto done;
471 		}
472 	}
473 
474 	ire_mp = NULL;
475 	if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
476 		/*
477 		 * request an IRE if src not 0 (INADDR_ANY)
478 		 */
479 		ire_mp = allocb(sizeof (ire_t), BPRI_HI);
480 		if (ire_mp == NULL) {
481 			icmp->icmp_pending_op = -1;
482 			error = ENOMEM;
483 			goto done;
484 		}
485 		DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
486 	}
487 done:
488 	rw_exit(&icmp->icmp_rwlock);
489 	if (error != 0)
490 		return (error);
491 
492 	if (icmp->icmp_family == AF_INET6) {
493 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
494 		    &sin6->sin6_addr, sin6->sin6_port, B_TRUE);
495 	} else {
496 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
497 		    sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
498 	}
499 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
500 	return (error);
501 }
502 
503 static void
504 rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error)
505 {
506 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
507 	if (icmp->icmp_state == TS_UNBND) {
508 		/*
509 		 * not yet bound - bind sent by icmp_bind_proto.
510 		 */
511 		rw_exit(&icmp->icmp_rwlock);
512 		return;
513 	}
514 	ASSERT(icmp->icmp_pending_op != -1);
515 	icmp->icmp_pending_op = -1;
516 
517 	if (error != 0) {
518 		if (icmp->icmp_state == TS_DATA_XFER) {
519 			/* Connect failed */
520 			/* Revert back to the bound source */
521 			icmp->icmp_v6src = icmp->icmp_bound_v6src;
522 			icmp->icmp_state = TS_IDLE;
523 			if (icmp->icmp_family == AF_INET6)
524 				(void) icmp_build_hdrs(icmp);
525 		} else {
526 			V6_SET_ZERO(icmp->icmp_v6src);
527 			V6_SET_ZERO(icmp->icmp_bound_v6src);
528 			icmp->icmp_state = TS_UNBND;
529 			if (icmp->icmp_family == AF_INET6)
530 				(void) icmp_build_hdrs(icmp);
531 		}
532 	} else {
533 		if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
534 			ire_t *ire;
535 
536 			ire = (ire_t *)ire_mp->b_rptr;
537 			/*
538 			 * If a broadcast/multicast address was bound set
539 			 * the source address to 0.
540 			 * This ensures no datagrams with broadcast address
541 			 * as source address are emitted (which would violate
542 			 * RFC1122 - Hosts requirements)
543 			 * Note: we get IRE_BROADCAST for IPv6
544 			 * to "mark" a multicast local address.
545 			 */
546 
547 
548 			if (ire->ire_type == IRE_BROADCAST &&
549 			    icmp->icmp_state != TS_DATA_XFER) {
550 				/*
551 				 * This was just a local bind to a
552 				 * MC/broadcast addr
553 				 */
554 				V6_SET_ZERO(icmp->icmp_v6src);
555 				if (icmp->icmp_family == AF_INET6)
556 					(void) icmp_build_hdrs(icmp);
557 			}
558 		}
559 
560 	}
561 	rw_exit(&icmp->icmp_rwlock);
562 	if (ire_mp != NULL)
563 		freeb(ire_mp);
564 }
565 
566 /*
567  * Send message to IP to just bind to the protocol.
568  */
569 static int
570 icmp_bind_proto(conn_t *connp)
571 {
572 	icmp_t	*icmp;
573 	int	error;
574 
575 	icmp = connp->conn_icmp;
576 
577 	if (icmp->icmp_family == AF_INET6)
578 		error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto,
579 		    &sin6_null.sin6_addr, 0, B_TRUE);
580 	else
581 		error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto,
582 		    sin_null.sin_addr.s_addr, 0, B_TRUE);
583 
584 	rawip_post_ip_bind_connect(icmp, NULL, error);
585 	return (error);
586 }
587 
588 static void
589 icmp_tpi_connect(queue_t *q, mblk_t *mp)
590 {
591 	conn_t	*connp = Q_TO_CONN(q);
592 	struct T_conn_req	*tcr;
593 	icmp_t	*icmp;
594 	struct sockaddr *sa;
595 	socklen_t len;
596 	int error;
597 	cred_t *cr;
598 
599 	/*
600 	 * All Solaris components should pass a db_credp
601 	 * for this TPI message, hence we ASSERT.
602 	 * But in case there is some other M_PROTO that looks
603 	 * like a TPI message sent by some other kernel
604 	 * component, we check and return an error.
605 	 */
606 	cr = msg_getcred(mp, NULL);
607 	ASSERT(cr != NULL);
608 	if (cr == NULL) {
609 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
610 		return;
611 	}
612 
613 	icmp = connp->conn_icmp;
614 	tcr = (struct T_conn_req *)mp->b_rptr;
615 	/* Sanity checks */
616 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
617 		icmp_err_ack(q, mp, TPROTO, 0);
618 		return;
619 	}
620 
621 	if (tcr->OPT_length != 0) {
622 		icmp_err_ack(q, mp, TBADOPT, 0);
623 		return;
624 	}
625 
626 	len = tcr->DEST_length;
627 
628 	switch (len) {
629 	default:
630 		icmp_err_ack(q, mp, TBADADDR, 0);
631 		return;
632 	case sizeof (sin_t):
633 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
634 		    sizeof (sin_t));
635 		break;
636 	case sizeof (sin6_t):
637 		sa = (struct sockaddr *)mi_offset_param(mp,
638 		    tcr->DEST_offset, sizeof (sin6_t));
639 		break;
640 	}
641 
642 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
643 	if (error != 0) {
644 		icmp_err_ack(q, mp, TSYSERR, error);
645 		return;
646 	}
647 
648 	error = rawip_do_connect(connp, sa, len, cr);
649 	if (error != 0) {
650 		if (error < 0) {
651 			icmp_err_ack(q, mp, -error, 0);
652 		} else {
653 			icmp_err_ack(q, mp, 0, error);
654 		}
655 	} else {
656 		mblk_t *mp1;
657 
658 		/*
659 		 * We have to send a connection confirmation to
660 		 * keep TLI happy.
661 		 */
662 		if (icmp->icmp_family == AF_INET) {
663 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
664 			    sizeof (sin_t), NULL, 0);
665 		} else {
666 			ASSERT(icmp->icmp_family == AF_INET6);
667 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
668 			    sizeof (sin6_t), NULL, 0);
669 		}
670 		if (mp1 == NULL) {
671 			rw_exit(&icmp->icmp_rwlock);
672 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
673 			return;
674 		}
675 
676 		/*
677 		 * Send ok_ack for T_CONN_REQ
678 		 */
679 		mp = mi_tpi_ok_ack_alloc(mp);
680 		if (mp == NULL) {
681 			/* Unable to reuse the T_CONN_REQ for the ack. */
682 			freemsg(mp1);
683 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
684 			return;
685 		}
686 		putnext(connp->conn_rq, mp);
687 		putnext(connp->conn_rq, mp1);
688 	}
689 }
690 
691 static int
692 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
693     cred_t *cr)
694 {
695 	icmp_t	*icmp;
696 	sin_t	*sin;
697 	sin6_t	*sin6;
698 	mblk_t  *ire_mp;
699 	int	error;
700 	ipaddr_t	v4dst;
701 	in6_addr_t	v6dst;
702 
703 	icmp = connp->conn_icmp;
704 
705 	if (sa == NULL || !OK_32PTR((char *)sa)) {
706 		return (EINVAL);
707 	}
708 
709 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
710 	if (ire_mp == NULL)
711 		return (ENOMEM);
712 	DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
713 
714 
715 	ASSERT(sa != NULL && len != 0);
716 
717 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
718 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
719 		rw_exit(&icmp->icmp_rwlock);
720 		freeb(ire_mp);
721 		return (-TOUTSTATE);
722 	}
723 
724 	switch (len) {
725 	case sizeof (sin_t):
726 		sin = (sin_t *)sa;
727 
728 		ASSERT(icmp->icmp_family == AF_INET);
729 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
730 
731 		v4dst = sin->sin_addr.s_addr;
732 		/*
733 		 * Interpret a zero destination to mean loopback.
734 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
735 		 * generate the T_CONN_CON.
736 		 */
737 		if (v4dst == INADDR_ANY) {
738 			v4dst = htonl(INADDR_LOOPBACK);
739 		}
740 
741 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
742 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
743 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
744 		    icmp->icmp_ip_snd_options_len;
745 		icmp->icmp_v6dst.sin6_addr = v6dst;
746 		icmp->icmp_v6dst.sin6_family = AF_INET6;
747 		icmp->icmp_v6dst.sin6_flowinfo = 0;
748 		icmp->icmp_v6dst.sin6_port = 0;
749 
750 		/*
751 		 * If the destination address is multicast and
752 		 * an outgoing multicast interface has been set,
753 		 * use the address of that interface as our
754 		 * source address if no source address has been set.
755 		 */
756 		if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY &&
757 		    CLASSD(v4dst) &&
758 		    icmp->icmp_multicast_if_addr != INADDR_ANY) {
759 			IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
760 			    &icmp->icmp_v6src);
761 		}
762 		break;
763 	case sizeof (sin6_t):
764 		sin6 = (sin6_t *)sa;
765 
766 		/* No support for mapped addresses on raw sockets */
767 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
768 			rw_exit(&icmp->icmp_rwlock);
769 			freeb(ire_mp);
770 			return (EADDRNOTAVAIL);
771 		}
772 
773 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
774 		ASSERT(icmp->icmp_family == AF_INET6);
775 
776 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
777 
778 		icmp->icmp_v6dst = *sin6;
779 		icmp->icmp_v6dst.sin6_port = 0;
780 
781 		/*
782 		 * Interpret a zero destination to mean loopback.
783 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
784 		 * generate the T_CONN_CON.
785 		 */
786 		if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) {
787 			icmp->icmp_v6dst.sin6_addr = ipv6_loopback;
788 		}
789 		/*
790 		 * If the destination address is multicast and
791 		 * an outgoing multicast interface has been set,
792 		 * then the ip bind logic will pick the correct source
793 		 * address (i.e. matching the outgoing multicast interface).
794 		 */
795 		break;
796 	}
797 
798 	icmp->icmp_pending_op = T_CONN_REQ;
799 
800 	if (icmp->icmp_state == TS_DATA_XFER) {
801 		/* Already connected - clear out state */
802 		icmp->icmp_v6src = icmp->icmp_bound_v6src;
803 		icmp->icmp_state = TS_IDLE;
804 	}
805 
806 	icmp->icmp_state = TS_DATA_XFER;
807 	rw_exit(&icmp->icmp_rwlock);
808 
809 	if (icmp->icmp_family == AF_INET6) {
810 		error = ip_proto_bind_connected_v6(connp, &ire_mp,
811 		    icmp->icmp_proto, &icmp->icmp_v6src, 0,
812 		    &icmp->icmp_v6dst.sin6_addr,
813 		    NULL, sin6->sin6_port, B_TRUE, B_TRUE, cr);
814 	} else {
815 		error = ip_proto_bind_connected_v4(connp, &ire_mp,
816 		    icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0,
817 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port,
818 		    B_TRUE, B_TRUE, cr);
819 	}
820 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
821 	return (error);
822 }
823 
824 static void
825 icmp_close_free(conn_t *connp)
826 {
827 	icmp_t *icmp = connp->conn_icmp;
828 
829 	/* If there are any options associated with the stream, free them. */
830 	if (icmp->icmp_ip_snd_options != NULL) {
831 		mi_free((char *)icmp->icmp_ip_snd_options);
832 		icmp->icmp_ip_snd_options = NULL;
833 		icmp->icmp_ip_snd_options_len = 0;
834 	}
835 
836 	if (icmp->icmp_filter != NULL) {
837 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
838 		icmp->icmp_filter = NULL;
839 	}
840 
841 	/* Free memory associated with sticky options */
842 	if (icmp->icmp_sticky_hdrs_len != 0) {
843 		kmem_free(icmp->icmp_sticky_hdrs,
844 		    icmp->icmp_sticky_hdrs_len);
845 		icmp->icmp_sticky_hdrs = NULL;
846 		icmp->icmp_sticky_hdrs_len = 0;
847 	}
848 	ip6_pkt_free(&icmp->icmp_sticky_ipp);
849 
850 	/*
851 	 * Clear any fields which the kmem_cache constructor clears.
852 	 * Only icmp_connp needs to be preserved.
853 	 * TBD: We should make this more efficient to avoid clearing
854 	 * everything.
855 	 */
856 	ASSERT(icmp->icmp_connp == connp);
857 	bzero(icmp, sizeof (icmp_t));
858 	icmp->icmp_connp = connp;
859 }
860 
861 static int
862 rawip_do_close(conn_t *connp)
863 {
864 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
865 
866 	ip_quiesce_conn(connp);
867 
868 	if (!IPCL_IS_NONSTR(connp)) {
869 		qprocsoff(connp->conn_rq);
870 	}
871 
872 	ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL &&
873 	    connp->conn_icmp->icmp_fallback_queue_tail == NULL);
874 	icmp_close_free(connp);
875 
876 	/*
877 	 * Now we are truly single threaded on this stream, and can
878 	 * delete the things hanging off the connp, and finally the connp.
879 	 * We removed this connp from the fanout list, it cannot be
880 	 * accessed thru the fanouts, and we already waited for the
881 	 * conn_ref to drop to 0. We are already in close, so
882 	 * there cannot be any other thread from the top. qprocsoff
883 	 * has completed, and service has completed or won't run in
884 	 * future.
885 	 */
886 	ASSERT(connp->conn_ref == 1);
887 
888 	if (!IPCL_IS_NONSTR(connp)) {
889 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
890 	} else {
891 		ip_free_helper_stream(connp);
892 	}
893 
894 	connp->conn_ref--;
895 	ipcl_conn_destroy(connp);
896 
897 	return (0);
898 }
899 
900 static int
901 icmp_close(queue_t *q, int flags)
902 {
903 	conn_t  *connp;
904 
905 	if (flags & SO_FALLBACK) {
906 		/*
907 		 * stream is being closed while in fallback
908 		 * simply free the resources that were allocated
909 		 */
910 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
911 		qprocsoff(q);
912 		goto done;
913 	}
914 
915 	connp = Q_TO_CONN(q);
916 	(void) rawip_do_close(connp);
917 done:
918 	q->q_ptr = WR(q)->q_ptr = NULL;
919 	return (0);
920 }
921 
922 /*
923  * This routine handles each T_DISCON_REQ message passed to icmp
924  * as an indicating that ICMP is no longer connected. This results
925  * in sending a T_BIND_REQ to IP to restore the binding to just
926  * the local address.
927  *
928  * The disconnect completes in rawip_post_ip_bind_connect.
929  */
930 static int
931 icmp_do_disconnect(conn_t *connp)
932 {
933 	icmp_t	*icmp;
934 	mblk_t	*ire_mp;
935 	int error;
936 
937 	icmp = connp->conn_icmp;
938 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
939 	if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
940 		rw_exit(&icmp->icmp_rwlock);
941 		return (-TOUTSTATE);
942 	}
943 	icmp->icmp_pending_op = T_DISCON_REQ;
944 	icmp->icmp_v6src = icmp->icmp_bound_v6src;
945 	icmp->icmp_state = TS_IDLE;
946 
947 
948 	if (icmp->icmp_family == AF_INET6) {
949 		/* Rebuild the header template */
950 		error = icmp_build_hdrs(icmp);
951 		if (error != 0) {
952 			icmp->icmp_pending_op = -1;
953 			rw_exit(&icmp->icmp_rwlock);
954 			return (error);
955 		}
956 	}
957 
958 	rw_exit(&icmp->icmp_rwlock);
959 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
960 	if (ire_mp == NULL) {
961 		return (ENOMEM);
962 	}
963 
964 	if (icmp->icmp_family == AF_INET6) {
965 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
966 		    &icmp->icmp_bound_v6src, 0, B_TRUE);
967 	} else {
968 
969 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
970 		    V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE);
971 	}
972 
973 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
974 
975 	return (error);
976 }
977 
978 static void
979 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
980 {
981 	conn_t	*connp = Q_TO_CONN(q);
982 	int	error;
983 
984 	/*
985 	 * Allocate the largest primitive we need to send back
986 	 * T_error_ack is > than T_ok_ack
987 	 */
988 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
989 	if (mp == NULL) {
990 		/* Unable to reuse the T_DISCON_REQ for the ack. */
991 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
992 		return;
993 	}
994 
995 	error = icmp_do_disconnect(connp);
996 
997 	if (error != 0) {
998 		if (error > 0) {
999 			icmp_err_ack(q, mp, 0, error);
1000 		} else {
1001 			icmp_err_ack(q, mp, -error, 0);
1002 		}
1003 	} else {
1004 		mp = mi_tpi_ok_ack_alloc(mp);
1005 		ASSERT(mp != NULL);
1006 		qreply(q, mp);
1007 	}
1008 
1009 }
1010 
1011 static int
1012 icmp_disconnect(conn_t *connp)
1013 {
1014 	int	error;
1015 	icmp_t	*icmp = connp->conn_icmp;
1016 
1017 	icmp->icmp_dgram_errind = B_FALSE;
1018 
1019 	error = icmp_do_disconnect(connp);
1020 
1021 	if (error < 0)
1022 		error = proto_tlitosyserr(-error);
1023 	return (error);
1024 }
1025 
1026 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1027 static void
1028 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1029 {
1030 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1031 		qreply(q, mp);
1032 }
1033 
1034 /* Shorthand to generate and send TPI error acks to our client */
1035 static void
1036 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1037     t_scalar_t t_error, int sys_error)
1038 {
1039 	struct T_error_ack	*teackp;
1040 
1041 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1042 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1043 		teackp = (struct T_error_ack *)mp->b_rptr;
1044 		teackp->ERROR_prim = primitive;
1045 		teackp->TLI_error = t_error;
1046 		teackp->UNIX_error = sys_error;
1047 		qreply(q, mp);
1048 	}
1049 }
1050 
1051 /*
1052  * icmp_icmp_error is called by icmp_input to process ICMP
1053  * messages passed up by IP.
1054  * Generates the appropriate permanent (non-transient) errors.
1055  * Assumes that IP has pulled up everything up to and including
1056  * the ICMP header.
1057  */
1058 static void
1059 icmp_icmp_error(conn_t *connp, mblk_t *mp)
1060 {
1061 	icmph_t *icmph;
1062 	ipha_t	*ipha;
1063 	int	iph_hdr_length;
1064 	sin_t	sin;
1065 	mblk_t	*mp1;
1066 	int	error = 0;
1067 	icmp_t	*icmp = connp->conn_icmp;
1068 
1069 	ipha = (ipha_t *)mp->b_rptr;
1070 
1071 	ASSERT(OK_32PTR(mp->b_rptr));
1072 
1073 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1074 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1075 		icmp_icmp_error_ipv6(connp, mp);
1076 		return;
1077 	}
1078 
1079 	/*
1080 	 * icmp does not support v4 mapped addresses
1081 	 * so we can never be here for a V6 socket
1082 	 * i.e. icmp_family == AF_INET6
1083 	 */
1084 	ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) &&
1085 	    (icmp->icmp_family == AF_INET));
1086 
1087 	ASSERT(icmp->icmp_family == AF_INET);
1088 
1089 	/* Skip past the outer IP and ICMP headers */
1090 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1091 	icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]);
1092 	ipha = (ipha_t *)&icmph[1];
1093 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1094 
1095 	switch (icmph->icmph_type) {
1096 	case ICMP_DEST_UNREACHABLE:
1097 		switch (icmph->icmph_code) {
1098 		case ICMP_FRAGMENTATION_NEEDED:
1099 			/*
1100 			 * IP has already adjusted the path MTU.
1101 			 */
1102 			break;
1103 		case ICMP_PORT_UNREACHABLE:
1104 		case ICMP_PROTOCOL_UNREACHABLE:
1105 			error = ECONNREFUSED;
1106 			break;
1107 		default:
1108 			/* Transient errors */
1109 			break;
1110 		}
1111 		break;
1112 	default:
1113 		/* Transient errors */
1114 		break;
1115 	}
1116 	if (error == 0) {
1117 		freemsg(mp);
1118 		return;
1119 	}
1120 
1121 	/*
1122 	 * Deliver T_UDERROR_IND when the application has asked for it.
1123 	 * The socket layer enables this automatically when connected.
1124 	 */
1125 	if (!icmp->icmp_dgram_errind) {
1126 		freemsg(mp);
1127 		return;
1128 	}
1129 
1130 	sin = sin_null;
1131 	sin.sin_family = AF_INET;
1132 	sin.sin_addr.s_addr = ipha->ipha_dst;
1133 
1134 	if (IPCL_IS_NONSTR(connp)) {
1135 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1136 		if (icmp->icmp_state == TS_DATA_XFER) {
1137 			if (sin.sin_addr.s_addr ==
1138 			    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) {
1139 				rw_exit(&icmp->icmp_rwlock);
1140 				(*connp->conn_upcalls->su_set_error)
1141 				    (connp->conn_upper_handle, error);
1142 				goto done;
1143 			}
1144 		} else {
1145 			icmp->icmp_delayed_error = error;
1146 			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1147 		}
1148 		rw_exit(&icmp->icmp_rwlock);
1149 	} else {
1150 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
1151 		    0, error);
1152 		if (mp1 != NULL)
1153 			putnext(connp->conn_rq, mp1);
1154 	}
1155 done:
1156 	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
1157 	freemsg(mp);
1158 }
1159 
1160 /*
1161  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6
1162  * for IPv6 packets.
1163  * Send permanent (non-transient) errors upstream.
1164  * Assumes that IP has pulled up all the extension headers as well
1165  * as the ICMPv6 header.
1166  */
1167 static void
1168 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
1169 {
1170 	icmp6_t		*icmp6;
1171 	ip6_t		*ip6h, *outer_ip6h;
1172 	uint16_t	iph_hdr_length;
1173 	uint8_t		*nexthdrp;
1174 	sin6_t		sin6;
1175 	mblk_t		*mp1;
1176 	int		error = 0;
1177 	icmp_t		*icmp = connp->conn_icmp;
1178 
1179 	outer_ip6h = (ip6_t *)mp->b_rptr;
1180 	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1181 		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1182 	else
1183 		iph_hdr_length = IPV6_HDR_LEN;
1184 
1185 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1186 	ip6h = (ip6_t *)&icmp6[1];
1187 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1188 		freemsg(mp);
1189 		return;
1190 	}
1191 
1192 	switch (icmp6->icmp6_type) {
1193 	case ICMP6_DST_UNREACH:
1194 		switch (icmp6->icmp6_code) {
1195 		case ICMP6_DST_UNREACH_NOPORT:
1196 			error = ECONNREFUSED;
1197 			break;
1198 		case ICMP6_DST_UNREACH_ADMIN:
1199 		case ICMP6_DST_UNREACH_NOROUTE:
1200 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1201 		case ICMP6_DST_UNREACH_ADDR:
1202 			/* Transient errors */
1203 			break;
1204 		default:
1205 			break;
1206 		}
1207 		break;
1208 	case ICMP6_PACKET_TOO_BIG: {
1209 		struct T_unitdata_ind	*tudi;
1210 		struct T_opthdr		*toh;
1211 		size_t			udi_size;
1212 		mblk_t			*newmp;
1213 		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1214 		    sizeof (struct ip6_mtuinfo);
1215 		sin6_t			*sin6;
1216 		struct ip6_mtuinfo	*mtuinfo;
1217 
1218 		/*
1219 		 * If the application has requested to receive path mtu
1220 		 * information, send up an empty message containing an
1221 		 * IPV6_PATHMTU ancillary data item.
1222 		 */
1223 		if (!icmp->icmp_ipv6_recvpathmtu)
1224 			break;
1225 
1226 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1227 		    opt_length;
1228 		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1229 			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1230 			break;
1231 		}
1232 
1233 		/*
1234 		 * newmp->b_cont is left to NULL on purpose.  This is an
1235 		 * empty message containing only ancillary data.
1236 		 */
1237 		newmp->b_datap->db_type = M_PROTO;
1238 		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1239 		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1240 		tudi->PRIM_type = T_UNITDATA_IND;
1241 		tudi->SRC_length = sizeof (sin6_t);
1242 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1243 		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1244 		tudi->OPT_length = opt_length;
1245 
1246 		sin6 = (sin6_t *)&tudi[1];
1247 		bzero(sin6, sizeof (sin6_t));
1248 		sin6->sin6_family = AF_INET6;
1249 		sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr;
1250 
1251 		toh = (struct T_opthdr *)&sin6[1];
1252 		toh->level = IPPROTO_IPV6;
1253 		toh->name = IPV6_PATHMTU;
1254 		toh->len = opt_length;
1255 		toh->status = 0;
1256 
1257 		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1258 		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1259 		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1260 		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1261 		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1262 		/*
1263 		 * We've consumed everything we need from the original
1264 		 * message.  Free it, then send our empty message.
1265 		 */
1266 		freemsg(mp);
1267 		icmp_ulp_recv(connp, newmp);
1268 
1269 		return;
1270 	}
1271 	case ICMP6_TIME_EXCEEDED:
1272 		/* Transient errors */
1273 		break;
1274 	case ICMP6_PARAM_PROB:
1275 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1276 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1277 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1278 		    (uchar_t *)nexthdrp) {
1279 			error = ECONNREFUSED;
1280 			break;
1281 		}
1282 		break;
1283 	}
1284 	if (error == 0) {
1285 		freemsg(mp);
1286 		return;
1287 	}
1288 
1289 	/*
1290 	 * Deliver T_UDERROR_IND when the application has asked for it.
1291 	 * The socket layer enables this automatically when connected.
1292 	 */
1293 	if (!icmp->icmp_dgram_errind) {
1294 		freemsg(mp);
1295 		return;
1296 	}
1297 
1298 	sin6 = sin6_null;
1299 	sin6.sin6_family = AF_INET6;
1300 	sin6.sin6_addr = ip6h->ip6_dst;
1301 	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1302 
1303 	if (IPCL_IS_NONSTR(connp)) {
1304 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1305 		if (icmp->icmp_state == TS_DATA_XFER) {
1306 			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1307 			    &icmp->icmp_v6dst.sin6_addr)) {
1308 				rw_exit(&icmp->icmp_rwlock);
1309 				(*connp->conn_upcalls->su_set_error)
1310 				    (connp->conn_upper_handle, error);
1311 				goto done;
1312 			}
1313 		} else {
1314 			icmp->icmp_delayed_error = error;
1315 			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1316 		}
1317 		rw_exit(&icmp->icmp_rwlock);
1318 	} else {
1319 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1320 		    NULL, 0, error);
1321 		if (mp1 != NULL)
1322 			putnext(connp->conn_rq, mp1);
1323 	}
1324 done:
1325 	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
1326 	freemsg(mp);
1327 }
1328 
1329 /*
1330  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1331  * The local address is filled in if endpoint is bound. The remote address
1332  * is filled in if remote address has been precified ("connected endpoint")
1333  * (The concept of connected CLTS sockets is alien to published TPI
1334  *  but we support it anyway).
1335  */
1336 static void
1337 icmp_addr_req(queue_t *q, mblk_t *mp)
1338 {
1339 	icmp_t	*icmp = Q_TO_ICMP(q);
1340 	mblk_t	*ackmp;
1341 	struct T_addr_ack *taa;
1342 
1343 	/* Make it large enough for worst case */
1344 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1345 	    2 * sizeof (sin6_t), 1);
1346 	if (ackmp == NULL) {
1347 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1348 		return;
1349 	}
1350 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1351 
1352 	bzero(taa, sizeof (struct T_addr_ack));
1353 	ackmp->b_wptr = (uchar_t *)&taa[1];
1354 
1355 	taa->PRIM_type = T_ADDR_ACK;
1356 	ackmp->b_datap->db_type = M_PCPROTO;
1357 	rw_enter(&icmp->icmp_rwlock, RW_READER);
1358 	/*
1359 	 * Note: Following code assumes 32 bit alignment of basic
1360 	 * data structures like sin_t and struct T_addr_ack.
1361 	 */
1362 	if (icmp->icmp_state != TS_UNBND) {
1363 		/*
1364 		 * Fill in local address
1365 		 */
1366 		taa->LOCADDR_offset = sizeof (*taa);
1367 		if (icmp->icmp_family == AF_INET) {
1368 			sin_t	*sin;
1369 
1370 			taa->LOCADDR_length = sizeof (sin_t);
1371 			sin = (sin_t *)&taa[1];
1372 			/* Fill zeroes and then intialize non-zero fields */
1373 			*sin = sin_null;
1374 			sin->sin_family = AF_INET;
1375 			if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
1376 			    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1377 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src,
1378 				    sin->sin_addr.s_addr);
1379 			} else {
1380 				/*
1381 				 * INADDR_ANY
1382 				 * icmp_v6src is not set, we might be bound to
1383 				 * broadcast/multicast. Use icmp_bound_v6src as
1384 				 * local address instead (that could
1385 				 * also still be INADDR_ANY)
1386 				 */
1387 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src,
1388 				    sin->sin_addr.s_addr);
1389 			}
1390 			ackmp->b_wptr = (uchar_t *)&sin[1];
1391 		} else {
1392 			sin6_t	*sin6;
1393 
1394 			ASSERT(icmp->icmp_family == AF_INET6);
1395 			taa->LOCADDR_length = sizeof (sin6_t);
1396 			sin6 = (sin6_t *)&taa[1];
1397 			/* Fill zeroes and then intialize non-zero fields */
1398 			*sin6 = sin6_null;
1399 			sin6->sin6_family = AF_INET6;
1400 			if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1401 				sin6->sin6_addr = icmp->icmp_v6src;
1402 			} else {
1403 				/*
1404 				 * UNSPECIFIED
1405 				 * icmp_v6src is not set, we might be bound to
1406 				 * broadcast/multicast. Use icmp_bound_v6src as
1407 				 * local address instead (that could
1408 				 * also still be UNSPECIFIED)
1409 				 */
1410 				sin6->sin6_addr = icmp->icmp_bound_v6src;
1411 			}
1412 			ackmp->b_wptr = (uchar_t *)&sin6[1];
1413 		}
1414 	}
1415 	rw_exit(&icmp->icmp_rwlock);
1416 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1417 	qreply(q, ackmp);
1418 }
1419 
1420 static void
1421 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1422 {
1423 	*tap = icmp_g_t_info_ack;
1424 
1425 	if (icmp->icmp_family == AF_INET6)
1426 		tap->ADDR_size = sizeof (sin6_t);
1427 	else
1428 		tap->ADDR_size = sizeof (sin_t);
1429 	tap->CURRENT_state = icmp->icmp_state;
1430 	tap->OPT_size = icmp_max_optsize;
1431 }
1432 
1433 static void
1434 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1435     t_uscalar_t cap_bits1)
1436 {
1437 	tcap->CAP_bits1 = 0;
1438 
1439 	if (cap_bits1 & TC1_INFO) {
1440 		icmp_copy_info(&tcap->INFO_ack, icmp);
1441 		tcap->CAP_bits1 |= TC1_INFO;
1442 	}
1443 }
1444 
1445 /*
1446  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1447  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1448  * icmp_g_t_info_ack.  The current state of the stream is copied from
1449  * icmp_state.
1450  */
1451 static void
1452 icmp_capability_req(queue_t *q, mblk_t *mp)
1453 {
1454 	icmp_t			*icmp = Q_TO_ICMP(q);
1455 	t_uscalar_t		cap_bits1;
1456 	struct T_capability_ack	*tcap;
1457 
1458 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1459 
1460 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1461 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1462 	if (!mp)
1463 		return;
1464 
1465 	tcap = (struct T_capability_ack *)mp->b_rptr;
1466 
1467 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1468 
1469 	qreply(q, mp);
1470 }
1471 
1472 /*
1473  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1474  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1475  * The current state of the stream is copied from icmp_state.
1476  */
1477 static void
1478 icmp_info_req(queue_t *q, mblk_t *mp)
1479 {
1480 	icmp_t	*icmp = Q_TO_ICMP(q);
1481 
1482 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1483 	    T_INFO_ACK);
1484 	if (!mp)
1485 		return;
1486 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1487 	qreply(q, mp);
1488 }
1489 
1490 /* For /dev/icmp aka AF_INET open */
1491 static int
1492 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1493     int family)
1494 {
1495 	conn_t *connp;
1496 	dev_t	conn_dev;
1497 	icmp_stack_t *is;
1498 	int	error;
1499 
1500 	conn_dev = NULL;
1501 
1502 	/* If the stream is already open, return immediately. */
1503 	if (q->q_ptr != NULL)
1504 		return (0);
1505 
1506 	if (sflag == MODOPEN)
1507 		return (EINVAL);
1508 
1509 	/*
1510 	 * Since ICMP is not used so heavily, allocating from the small
1511 	 * arena should be sufficient.
1512 	 */
1513 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1514 		return (EBUSY);
1515 	}
1516 
1517 	if (flag & SO_FALLBACK) {
1518 		/*
1519 		 * Non streams socket needs a stream to fallback to
1520 		 */
1521 		RD(q)->q_ptr = (void *)conn_dev;
1522 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1523 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1524 		qprocson(q);
1525 		return (0);
1526 	}
1527 
1528 	connp = icmp_open(family, credp, &error, KM_SLEEP);
1529 	if (connp == NULL) {
1530 		ASSERT(error != NULL);
1531 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1532 		return (error);
1533 	}
1534 
1535 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1536 	connp->conn_dev = conn_dev;
1537 	connp->conn_minor_arena = ip_minor_arena_sa;
1538 
1539 	is = connp->conn_icmp->icmp_is;
1540 
1541 	/*
1542 	 * Initialize the icmp_t structure for this stream.
1543 	 */
1544 	q->q_ptr = connp;
1545 	WR(q)->q_ptr = connp;
1546 	connp->conn_rq = q;
1547 	connp->conn_wq = WR(q);
1548 
1549 	if (connp->conn_icmp->icmp_family == AF_INET6) {
1550 		/* Build initial header template for transmit */
1551 		rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
1552 		if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) {
1553 			rw_exit(&connp->conn_icmp->icmp_rwlock);
1554 			inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1555 			ipcl_conn_destroy(connp);
1556 			return (error);
1557 		}
1558 		rw_exit(&connp->conn_icmp->icmp_rwlock);
1559 	}
1560 
1561 
1562 	q->q_hiwat = is->is_recv_hiwat;
1563 	WR(q)->q_hiwat = is->is_xmit_hiwat;
1564 	WR(q)->q_lowat = is->is_xmit_lowat;
1565 
1566 	qprocson(q);
1567 
1568 	/* Set the Stream head write offset. */
1569 	(void) proto_set_tx_wroff(q, connp,
1570 	    connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra);
1571 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat);
1572 
1573 	mutex_enter(&connp->conn_lock);
1574 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1575 	mutex_exit(&connp->conn_lock);
1576 
1577 	return (0);
1578 }
1579 
1580 /* For /dev/icmp4 aka AF_INET open */
1581 static int
1582 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1583 {
1584 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1585 }
1586 
1587 /* For /dev/icmp6 aka AF_INET6 open */
1588 static int
1589 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1590 {
1591 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1592 }
1593 
1594 /*
1595  * This is the open routine for icmp.  It allocates a icmp_t structure for
1596  * the stream and, on the first open of the module, creates an ND table.
1597  */
1598 /* ARGSUSED */
1599 static conn_t *
1600 icmp_open(int family, cred_t *credp, int *err, int flags)
1601 {
1602 	icmp_t	*icmp;
1603 	conn_t *connp;
1604 	zoneid_t zoneid;
1605 	netstack_t *ns;
1606 	icmp_stack_t *is;
1607 	boolean_t isv6 = B_FALSE;
1608 
1609 	*err = secpolicy_net_icmpaccess(credp);
1610 	if (*err != 0)
1611 		return (NULL);
1612 
1613 	if (family == AF_INET6)
1614 		isv6 = B_TRUE;
1615 	ns = netstack_find_by_cred(credp);
1616 	ASSERT(ns != NULL);
1617 	is = ns->netstack_icmp;
1618 	ASSERT(is != NULL);
1619 
1620 	/*
1621 	 * For exclusive stacks we set the zoneid to zero
1622 	 * to make ICMP operate as if in the global zone.
1623 	 */
1624 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1625 		zoneid = GLOBAL_ZONEID;
1626 	else
1627 		zoneid = crgetzoneid(credp);
1628 
1629 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1630 
1631 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1632 	icmp = connp->conn_icmp;
1633 	icmp->icmp_v6dst = sin6_null;
1634 
1635 	/*
1636 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1637 	 * done by netstack_find_by_cred()
1638 	 */
1639 	netstack_rele(ns);
1640 
1641 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1642 	ASSERT(connp->conn_ulp == IPPROTO_ICMP);
1643 	ASSERT(connp->conn_icmp == icmp);
1644 	ASSERT(icmp->icmp_connp == connp);
1645 
1646 	/* Set the initial state of the stream and the privilege status. */
1647 	icmp->icmp_state = TS_UNBND;
1648 	if (isv6) {
1649 		icmp->icmp_ipversion = IPV6_VERSION;
1650 		icmp->icmp_family = AF_INET6;
1651 		connp->conn_ulp = IPPROTO_ICMPV6;
1652 		/* May be changed by a SO_PROTOTYPE socket option. */
1653 		icmp->icmp_proto = IPPROTO_ICMPV6;
1654 		icmp->icmp_checksum_off = 2;	/* Offset for icmp6_cksum */
1655 		icmp->icmp_max_hdr_len = IPV6_HDR_LEN;
1656 		icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit;
1657 		connp->conn_af_isv6 = B_TRUE;
1658 		connp->conn_flags |= IPCL_ISV6;
1659 	} else {
1660 		icmp->icmp_ipversion = IPV4_VERSION;
1661 		icmp->icmp_family = AF_INET;
1662 		/* May be changed by a SO_PROTOTYPE socket option. */
1663 		icmp->icmp_proto = IPPROTO_ICMP;
1664 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH;
1665 		icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl;
1666 		connp->conn_af_isv6 = B_FALSE;
1667 		connp->conn_flags &= ~IPCL_ISV6;
1668 	}
1669 	icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1670 	icmp->icmp_pending_op = -1;
1671 	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1672 	connp->conn_zoneid = zoneid;
1673 
1674 	/*
1675 	 * If the caller has the process-wide flag set, then default to MAC
1676 	 * exempt mode.  This allows read-down to unlabeled hosts.
1677 	 */
1678 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1679 		connp->conn_mac_exempt = B_TRUE;
1680 
1681 	connp->conn_ulp_labeled = is_system_labeled();
1682 
1683 	icmp->icmp_is = is;
1684 
1685 	connp->conn_recv = icmp_input;
1686 	crhold(credp);
1687 	connp->conn_cred = credp;
1688 
1689 	rw_exit(&icmp->icmp_rwlock);
1690 
1691 	connp->conn_flow_cntrld = B_FALSE;
1692 	return (connp);
1693 }
1694 
1695 /*
1696  * Which ICMP options OK to set through T_UNITDATA_REQ...
1697  */
1698 /* ARGSUSED */
1699 static boolean_t
1700 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1701 {
1702 	return (B_TRUE);
1703 }
1704 
1705 /*
1706  * This routine gets default values of certain options whose default
1707  * values are maintained by protcol specific code
1708  */
1709 /* ARGSUSED */
1710 int
1711 icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
1712 {
1713 	icmp_t *icmp = Q_TO_ICMP(q);
1714 	icmp_stack_t *is = icmp->icmp_is;
1715 	int *i1 = (int *)ptr;
1716 
1717 	switch (level) {
1718 	case IPPROTO_IP:
1719 		switch (name) {
1720 		case IP_MULTICAST_TTL:
1721 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1722 			return (sizeof (uchar_t));
1723 		case IP_MULTICAST_LOOP:
1724 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1725 			return (sizeof (uchar_t));
1726 		}
1727 		break;
1728 	case IPPROTO_IPV6:
1729 		switch (name) {
1730 		case IPV6_MULTICAST_HOPS:
1731 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1732 			return (sizeof (int));
1733 		case IPV6_MULTICAST_LOOP:
1734 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1735 			return (sizeof (int));
1736 		case IPV6_UNICAST_HOPS:
1737 			*i1 = is->is_ipv6_hoplimit;
1738 			return (sizeof (int));
1739 		}
1740 		break;
1741 	case IPPROTO_ICMPV6:
1742 		switch (name) {
1743 		case ICMP6_FILTER:
1744 			/* Make it look like "pass all" */
1745 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1746 			return (sizeof (icmp6_filter_t));
1747 		}
1748 		break;
1749 	}
1750 	return (-1);
1751 }
1752 
1753 /*
1754  * This routine retrieves the current status of socket options.
1755  * It returns the size of the option retrieved.
1756  */
1757 int
1758 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1759 {
1760 	icmp_t		*icmp = connp->conn_icmp;
1761 	icmp_stack_t	*is = icmp->icmp_is;
1762 	int		*i1 = (int *)ptr;
1763 	ip6_pkt_t	*ipp = &icmp->icmp_sticky_ipp;
1764 	int		ret = 0;
1765 
1766 	ASSERT(RW_READ_HELD(&icmp->icmp_rwlock));
1767 	switch (level) {
1768 	case SOL_SOCKET:
1769 		switch (name) {
1770 		case SO_DEBUG:
1771 			*i1 = icmp->icmp_debug;
1772 			break;
1773 		case SO_TYPE:
1774 			*i1 = SOCK_RAW;
1775 			break;
1776 		case SO_PROTOTYPE:
1777 			*i1 = icmp->icmp_proto;
1778 			break;
1779 		case SO_REUSEADDR:
1780 			*i1 = icmp->icmp_reuseaddr;
1781 			break;
1782 
1783 		/*
1784 		 * The following three items are available here,
1785 		 * but are only meaningful to IP.
1786 		 */
1787 		case SO_DONTROUTE:
1788 			*i1 = icmp->icmp_dontroute;
1789 			break;
1790 		case SO_USELOOPBACK:
1791 			*i1 = icmp->icmp_useloopback;
1792 			break;
1793 		case SO_BROADCAST:
1794 			*i1 = icmp->icmp_broadcast;
1795 			break;
1796 
1797 		case SO_SNDBUF:
1798 			ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX);
1799 			*i1 = icmp->icmp_xmit_hiwat;
1800 			break;
1801 		case SO_RCVBUF:
1802 			ASSERT(icmp->icmp_recv_hiwat <= INT_MAX);
1803 			*i1 = icmp->icmp_recv_hiwat;
1804 			break;
1805 		case SO_DGRAM_ERRIND:
1806 			*i1 = icmp->icmp_dgram_errind;
1807 			break;
1808 		case SO_TIMESTAMP:
1809 			*i1 = icmp->icmp_timestamp;
1810 			break;
1811 		case SO_MAC_EXEMPT:
1812 			*i1 = connp->conn_mac_exempt;
1813 			break;
1814 		case SO_DOMAIN:
1815 			*i1 = icmp->icmp_family;
1816 			break;
1817 
1818 		/*
1819 		 * Following four not meaningful for icmp
1820 		 * Action is same as "default" to which we fallthrough
1821 		 * so we keep them in comments.
1822 		 * case SO_LINGER:
1823 		 * case SO_KEEPALIVE:
1824 		 * case SO_OOBINLINE:
1825 		 * case SO_ALLZONES:
1826 		 */
1827 		default:
1828 			ret = -1;
1829 			goto done;
1830 		}
1831 		break;
1832 	case IPPROTO_IP:
1833 		/*
1834 		 * Only allow IPv4 option processing on IPv4 sockets.
1835 		 */
1836 		if (icmp->icmp_family != AF_INET) {
1837 			ret = -1;
1838 			goto done;
1839 		}
1840 
1841 		switch (name) {
1842 		case IP_OPTIONS:
1843 		case T_IP_OPTIONS:
1844 			/* Options are passed up with each packet */
1845 			ret = 0;
1846 			goto done;
1847 		case IP_HDRINCL:
1848 			*i1 = (int)icmp->icmp_hdrincl;
1849 			break;
1850 		case IP_TOS:
1851 		case T_IP_TOS:
1852 			*i1 = (int)icmp->icmp_type_of_service;
1853 			break;
1854 		case IP_TTL:
1855 			*i1 = (int)icmp->icmp_ttl;
1856 			break;
1857 		case IP_MULTICAST_IF:
1858 			/* 0 address if not set */
1859 			*(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr;
1860 			ret = sizeof (ipaddr_t);
1861 			goto done;
1862 		case IP_MULTICAST_TTL:
1863 			*(uchar_t *)ptr = icmp->icmp_multicast_ttl;
1864 			ret = sizeof (uchar_t);
1865 			goto done;
1866 		case IP_MULTICAST_LOOP:
1867 			*ptr = connp->conn_multicast_loop;
1868 			ret = sizeof (uint8_t);
1869 			goto done;
1870 		case IP_BOUND_IF:
1871 			/* Zero if not set */
1872 			*i1 = icmp->icmp_bound_if;
1873 			break;	/* goto sizeof (int) option return */
1874 		case IP_UNSPEC_SRC:
1875 			*ptr = icmp->icmp_unspec_source;
1876 			break;	/* goto sizeof (int) option return */
1877 		case IP_RECVIF:
1878 			*ptr = icmp->icmp_recvif;
1879 			break;	/* goto sizeof (int) option return */
1880 		case IP_BROADCAST_TTL:
1881 			*(uchar_t *)ptr = connp->conn_broadcast_ttl;
1882 			return (sizeof (uchar_t));
1883 		case IP_RECVPKTINFO:
1884 			/*
1885 			 * This also handles IP_PKTINFO.
1886 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
1887 			 * Differentiation is based on the size of the argument
1888 			 * passed in.
1889 			 * This option is handled in IP which will return an
1890 			 * error for IP_PKTINFO as it's not supported as a
1891 			 * sticky option.
1892 			 */
1893 			ret = -EINVAL;
1894 			goto done;
1895 		/*
1896 		 * Cannot "get" the value of following options
1897 		 * at this level. Action is same as "default" to
1898 		 * which we fallthrough so we keep them in comments.
1899 		 *
1900 		 * case IP_ADD_MEMBERSHIP:
1901 		 * case IP_DROP_MEMBERSHIP:
1902 		 * case IP_BLOCK_SOURCE:
1903 		 * case IP_UNBLOCK_SOURCE:
1904 		 * case IP_ADD_SOURCE_MEMBERSHIP:
1905 		 * case IP_DROP_SOURCE_MEMBERSHIP:
1906 		 * case MCAST_JOIN_GROUP:
1907 		 * case MCAST_LEAVE_GROUP:
1908 		 * case MCAST_BLOCK_SOURCE:
1909 		 * case MCAST_UNBLOCK_SOURCE:
1910 		 * case MCAST_JOIN_SOURCE_GROUP:
1911 		 * case MCAST_LEAVE_SOURCE_GROUP:
1912 		 * case MRT_INIT:
1913 		 * case MRT_DONE:
1914 		 * case MRT_ADD_VIF:
1915 		 * case MRT_DEL_VIF:
1916 		 * case MRT_ADD_MFC:
1917 		 * case MRT_DEL_MFC:
1918 		 * case MRT_VERSION:
1919 		 * case MRT_ASSERT:
1920 		 * case IP_SEC_OPT:
1921 		 * case IP_NEXTHOP:
1922 		 */
1923 		default:
1924 			ret = -1;
1925 			goto done;
1926 		}
1927 		break;
1928 	case IPPROTO_IPV6:
1929 		/*
1930 		 * Only allow IPv6 option processing on native IPv6 sockets.
1931 		 */
1932 		if (icmp->icmp_family != AF_INET6) {
1933 			ret = -1;
1934 			goto done;
1935 		}
1936 		switch (name) {
1937 		case IPV6_UNICAST_HOPS:
1938 			*i1 = (unsigned int)icmp->icmp_ttl;
1939 			break;
1940 		case IPV6_MULTICAST_IF:
1941 			/* 0 index if not set */
1942 			*i1 = icmp->icmp_multicast_if_index;
1943 			break;
1944 		case IPV6_MULTICAST_HOPS:
1945 			*i1 = icmp->icmp_multicast_ttl;
1946 			break;
1947 		case IPV6_MULTICAST_LOOP:
1948 			*i1 = connp->conn_multicast_loop;
1949 			break;
1950 		case IPV6_BOUND_IF:
1951 			/* Zero if not set */
1952 			*i1 = icmp->icmp_bound_if;
1953 			break;
1954 		case IPV6_UNSPEC_SRC:
1955 			*i1 = icmp->icmp_unspec_source;
1956 			break;
1957 		case IPV6_CHECKSUM:
1958 			/*
1959 			 * Return offset or -1 if no checksum offset.
1960 			 * Does not apply to IPPROTO_ICMPV6
1961 			 */
1962 			if (icmp->icmp_proto == IPPROTO_ICMPV6) {
1963 				ret = -1;
1964 				goto done;
1965 			}
1966 
1967 			if (icmp->icmp_raw_checksum) {
1968 				*i1 = icmp->icmp_checksum_off;
1969 			} else {
1970 				*i1 = -1;
1971 			}
1972 			break;
1973 		case IPV6_JOIN_GROUP:
1974 		case IPV6_LEAVE_GROUP:
1975 		case MCAST_JOIN_GROUP:
1976 		case MCAST_LEAVE_GROUP:
1977 		case MCAST_BLOCK_SOURCE:
1978 		case MCAST_UNBLOCK_SOURCE:
1979 		case MCAST_JOIN_SOURCE_GROUP:
1980 		case MCAST_LEAVE_SOURCE_GROUP:
1981 			/* cannot "get" the value for these */
1982 			ret = -1;
1983 			goto done;
1984 		case IPV6_RECVPKTINFO:
1985 			*i1 = icmp->icmp_ip_recvpktinfo;
1986 			break;
1987 		case IPV6_RECVTCLASS:
1988 			*i1 = icmp->icmp_ipv6_recvtclass;
1989 			break;
1990 		case IPV6_RECVPATHMTU:
1991 			*i1 = icmp->icmp_ipv6_recvpathmtu;
1992 			break;
1993 		case IPV6_V6ONLY:
1994 			*i1 = 1;
1995 			break;
1996 		case IPV6_RECVHOPLIMIT:
1997 			*i1 = icmp->icmp_ipv6_recvhoplimit;
1998 			break;
1999 		case IPV6_RECVHOPOPTS:
2000 			*i1 = icmp->icmp_ipv6_recvhopopts;
2001 			break;
2002 		case IPV6_RECVDSTOPTS:
2003 			*i1 = icmp->icmp_ipv6_recvdstopts;
2004 			break;
2005 		case _OLD_IPV6_RECVDSTOPTS:
2006 			*i1 = icmp->icmp_old_ipv6_recvdstopts;
2007 			break;
2008 		case IPV6_RECVRTHDRDSTOPTS:
2009 			*i1 = icmp->icmp_ipv6_recvrtdstopts;
2010 			break;
2011 		case IPV6_RECVRTHDR:
2012 			*i1 = icmp->icmp_ipv6_recvrthdr;
2013 			break;
2014 		case IPV6_PKTINFO: {
2015 			/* XXX assumes that caller has room for max size! */
2016 			struct in6_pktinfo *pkti;
2017 
2018 			pkti = (struct in6_pktinfo *)ptr;
2019 			if (ipp->ipp_fields & IPPF_IFINDEX)
2020 				pkti->ipi6_ifindex = ipp->ipp_ifindex;
2021 			else
2022 				pkti->ipi6_ifindex = 0;
2023 			if (ipp->ipp_fields & IPPF_ADDR)
2024 				pkti->ipi6_addr = ipp->ipp_addr;
2025 			else
2026 				pkti->ipi6_addr = ipv6_all_zeros;
2027 			ret = sizeof (struct in6_pktinfo);
2028 			goto done;
2029 		}
2030 		case IPV6_NEXTHOP: {
2031 			sin6_t *sin6 = (sin6_t *)ptr;
2032 
2033 			if (!(ipp->ipp_fields & IPPF_NEXTHOP))
2034 				return (0);
2035 			*sin6 = sin6_null;
2036 			sin6->sin6_family = AF_INET6;
2037 			sin6->sin6_addr = ipp->ipp_nexthop;
2038 			ret = (sizeof (sin6_t));
2039 			goto done;
2040 		}
2041 		case IPV6_HOPOPTS:
2042 			if (!(ipp->ipp_fields & IPPF_HOPOPTS))
2043 				return (0);
2044 			if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6)
2045 				return (0);
2046 			bcopy((char *)ipp->ipp_hopopts +
2047 			    icmp->icmp_label_len_v6, ptr,
2048 			    ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2049 			if (icmp->icmp_label_len_v6 > 0) {
2050 				ptr[0] = ((char *)ipp->ipp_hopopts)[0];
2051 				ptr[1] = (ipp->ipp_hopoptslen -
2052 				    icmp->icmp_label_len_v6 + 7) / 8 - 1;
2053 			}
2054 			ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2055 			goto done;
2056 		case IPV6_RTHDRDSTOPTS:
2057 			if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
2058 				return (0);
2059 			bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
2060 			ret = ipp->ipp_rtdstoptslen;
2061 			goto done;
2062 		case IPV6_RTHDR:
2063 			if (!(ipp->ipp_fields & IPPF_RTHDR))
2064 				return (0);
2065 			bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
2066 			ret = ipp->ipp_rthdrlen;
2067 			goto done;
2068 		case IPV6_DSTOPTS:
2069 			if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2070 				ret = 0;
2071 				goto done;
2072 			}
2073 			bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
2074 			ret = ipp->ipp_dstoptslen;
2075 			goto done;
2076 		case IPV6_PATHMTU:
2077 			if (!(ipp->ipp_fields & IPPF_PATHMTU)) {
2078 				ret = 0;
2079 			} else {
2080 				ret = ip_fill_mtuinfo(
2081 				    &icmp->icmp_v6dst.sin6_addr, 0,
2082 				    (struct ip6_mtuinfo *)ptr,
2083 				    is->is_netstack);
2084 			}
2085 			goto done;
2086 		case IPV6_TCLASS:
2087 			if (ipp->ipp_fields & IPPF_TCLASS)
2088 				*i1 = ipp->ipp_tclass;
2089 			else
2090 				*i1 = IPV6_FLOW_TCLASS(
2091 				    IPV6_DEFAULT_VERS_AND_FLOW);
2092 			break;
2093 		default:
2094 			ret = -1;
2095 			goto done;
2096 		}
2097 		break;
2098 	case IPPROTO_ICMPV6:
2099 		/*
2100 		 * Only allow IPv6 option processing on native IPv6 sockets.
2101 		 */
2102 		if (icmp->icmp_family != AF_INET6) {
2103 			ret = -1;
2104 		}
2105 
2106 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
2107 			ret = -1;
2108 		}
2109 
2110 		switch (name) {
2111 		case ICMP6_FILTER:
2112 			if (icmp->icmp_filter == NULL) {
2113 				/* Make it look like "pass all" */
2114 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
2115 			} else {
2116 				(void) bcopy(icmp->icmp_filter, ptr,
2117 				    sizeof (icmp6_filter_t));
2118 			}
2119 			ret = sizeof (icmp6_filter_t);
2120 			goto done;
2121 		default:
2122 			ret = -1;
2123 			goto done;
2124 		}
2125 	default:
2126 		ret = -1;
2127 		goto done;
2128 	}
2129 	ret = sizeof (int);
2130 done:
2131 	return (ret);
2132 }
2133 
2134 /*
2135  * This routine retrieves the current status of socket options.
2136  * It returns the size of the option retrieved.
2137  */
2138 int
2139 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
2140 {
2141 	conn_t  *connp = Q_TO_CONN(q);
2142 	icmp_t	*icmp = connp->conn_icmp;
2143 	int 	err;
2144 
2145 	rw_enter(&icmp->icmp_rwlock, RW_READER);
2146 	err = icmp_opt_get(connp, level, name, ptr);
2147 	rw_exit(&icmp->icmp_rwlock);
2148 	return (err);
2149 }
2150 
2151 int
2152 icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
2153     uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
2154     void *thisdg_attrs, boolean_t checkonly)
2155 {
2156 
2157 	int	*i1 = (int *)invalp;
2158 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
2159 	icmp_t *icmp = connp->conn_icmp;
2160 	icmp_stack_t *is = icmp->icmp_is;
2161 	int	error;
2162 
2163 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
2164 	/*
2165 	 * For fixed length options, no sanity check
2166 	 * of passed in length is done. It is assumed *_optcom_req()
2167 	 * routines do the right thing.
2168 	 */
2169 	switch (level) {
2170 	case SOL_SOCKET:
2171 		switch (name) {
2172 		case SO_DEBUG:
2173 			if (!checkonly)
2174 				icmp->icmp_debug = onoff;
2175 			break;
2176 		case SO_PROTOTYPE:
2177 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2178 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2179 			    secpolicy_net_rawaccess(cr) != 0) {
2180 				*outlenp = 0;
2181 				return (EACCES);
2182 			}
2183 			/* Can't use IPPROTO_RAW with IPv6 */
2184 			if ((*i1 & 0xFF) == IPPROTO_RAW &&
2185 			    icmp->icmp_family == AF_INET6) {
2186 				*outlenp = 0;
2187 				return (EPROTONOSUPPORT);
2188 			}
2189 			if (checkonly) {
2190 				/* T_CHECK case */
2191 				*(int *)outvalp = (*i1 & 0xFF);
2192 				break;
2193 			}
2194 			icmp->icmp_proto = *i1 & 0xFF;
2195 			if ((icmp->icmp_proto == IPPROTO_RAW ||
2196 			    icmp->icmp_proto == IPPROTO_IGMP) &&
2197 			    icmp->icmp_family == AF_INET)
2198 				icmp->icmp_hdrincl = 1;
2199 			else
2200 				icmp->icmp_hdrincl = 0;
2201 
2202 			if (icmp->icmp_family == AF_INET6 &&
2203 			    icmp->icmp_proto == IPPROTO_ICMPV6) {
2204 				/* Set offset for icmp6_cksum */
2205 				icmp->icmp_raw_checksum = 0;
2206 				icmp->icmp_checksum_off = 2;
2207 			}
2208 			if (icmp->icmp_proto == IPPROTO_UDP ||
2209 			    icmp->icmp_proto == IPPROTO_TCP ||
2210 			    icmp->icmp_proto == IPPROTO_SCTP) {
2211 				icmp->icmp_no_tp_cksum = 1;
2212 				icmp->icmp_sticky_ipp.ipp_fields |=
2213 				    IPPF_NO_CKSUM;
2214 			} else {
2215 				icmp->icmp_no_tp_cksum = 0;
2216 				icmp->icmp_sticky_ipp.ipp_fields &=
2217 				    ~IPPF_NO_CKSUM;
2218 			}
2219 
2220 			if (icmp->icmp_filter != NULL &&
2221 			    icmp->icmp_proto != IPPROTO_ICMPV6) {
2222 				kmem_free(icmp->icmp_filter,
2223 				    sizeof (icmp6_filter_t));
2224 				icmp->icmp_filter = NULL;
2225 			}
2226 
2227 			/* Rebuild the header template */
2228 			error = icmp_build_hdrs(icmp);
2229 			if (error != 0) {
2230 				*outlenp = 0;
2231 				return (error);
2232 			}
2233 
2234 			/*
2235 			 * For SCTP, we don't use icmp_bind_proto() for
2236 			 * raw socket binding.  Note that we do not need
2237 			 * to set *outlenp.
2238 			 * FIXME: how does SCTP work?
2239 			 */
2240 			if (icmp->icmp_proto == IPPROTO_SCTP)
2241 				return (0);
2242 
2243 			*outlenp = sizeof (int);
2244 			*(int *)outvalp = *i1 & 0xFF;
2245 
2246 			/* Drop lock across the bind operation */
2247 			rw_exit(&icmp->icmp_rwlock);
2248 			(void) icmp_bind_proto(connp);
2249 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2250 			return (0);
2251 		case SO_REUSEADDR:
2252 			if (!checkonly) {
2253 				icmp->icmp_reuseaddr = onoff;
2254 				PASS_OPT_TO_IP(connp);
2255 			}
2256 			break;
2257 
2258 		/*
2259 		 * The following three items are available here,
2260 		 * but are only meaningful to IP.
2261 		 */
2262 		case SO_DONTROUTE:
2263 			if (!checkonly) {
2264 				icmp->icmp_dontroute = onoff;
2265 				PASS_OPT_TO_IP(connp);
2266 			}
2267 			break;
2268 		case SO_USELOOPBACK:
2269 			if (!checkonly) {
2270 				icmp->icmp_useloopback = onoff;
2271 				PASS_OPT_TO_IP(connp);
2272 			}
2273 			break;
2274 		case SO_BROADCAST:
2275 			if (!checkonly) {
2276 				icmp->icmp_broadcast = onoff;
2277 				PASS_OPT_TO_IP(connp);
2278 			}
2279 			break;
2280 
2281 		case SO_SNDBUF:
2282 			if (*i1 > is->is_max_buf) {
2283 				*outlenp = 0;
2284 				return (ENOBUFS);
2285 			}
2286 			if (!checkonly) {
2287 				if (!IPCL_IS_NONSTR(connp)) {
2288 					connp->conn_wq->q_hiwat = *i1;
2289 				}
2290 				icmp->icmp_xmit_hiwat = *i1;
2291 			}
2292 			break;
2293 		case SO_RCVBUF:
2294 			if (*i1 > is->is_max_buf) {
2295 				*outlenp = 0;
2296 				return (ENOBUFS);
2297 			}
2298 			if (!checkonly) {
2299 				icmp->icmp_recv_hiwat = *i1;
2300 				rw_exit(&icmp->icmp_rwlock);
2301 				(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2302 				    *i1);
2303 				rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2304 			}
2305 			break;
2306 		case SO_DGRAM_ERRIND:
2307 			if (!checkonly)
2308 				icmp->icmp_dgram_errind = onoff;
2309 			break;
2310 		case SO_ALLZONES:
2311 			/*
2312 			 * "soft" error (negative)
2313 			 * option not handled at this level
2314 			 * Note: Do not modify *outlenp
2315 			 */
2316 			return (-EINVAL);
2317 		case SO_TIMESTAMP:
2318 			if (!checkonly) {
2319 				icmp->icmp_timestamp = onoff;
2320 			}
2321 			break;
2322 		case SO_MAC_EXEMPT:
2323 			/*
2324 			 * "soft" error (negative)
2325 			 * option not handled at this level
2326 			 * Note: Do not modify *outlenp
2327 			 */
2328 			return (-EINVAL);
2329 		case SO_RCVTIMEO:
2330 		case SO_SNDTIMEO:
2331 			/*
2332 			 * Pass these two options in order for third part
2333 			 * protocol usage. Here just return directly.
2334 			 */
2335 			return (0);
2336 		/*
2337 		 * Following three not meaningful for icmp
2338 		 * Action is same as "default" so we keep them
2339 		 * in comments.
2340 		 * case SO_LINGER:
2341 		 * case SO_KEEPALIVE:
2342 		 * case SO_OOBINLINE:
2343 		 */
2344 		default:
2345 			*outlenp = 0;
2346 			return (EINVAL);
2347 		}
2348 		break;
2349 	case IPPROTO_IP:
2350 		/*
2351 		 * Only allow IPv4 option processing on IPv4 sockets.
2352 		 */
2353 		if (icmp->icmp_family != AF_INET) {
2354 			*outlenp = 0;
2355 			return (ENOPROTOOPT);
2356 		}
2357 		switch (name) {
2358 		case IP_OPTIONS:
2359 		case T_IP_OPTIONS:
2360 			/* Save options for use by IP. */
2361 			if ((inlen & 0x3) ||
2362 			    inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) {
2363 				*outlenp = 0;
2364 				return (EINVAL);
2365 			}
2366 			if (checkonly)
2367 				break;
2368 
2369 			if (!tsol_option_set(&icmp->icmp_ip_snd_options,
2370 			    &icmp->icmp_ip_snd_options_len,
2371 			    icmp->icmp_label_len, invalp, inlen)) {
2372 				*outlenp = 0;
2373 				return (ENOMEM);
2374 			}
2375 
2376 			icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
2377 			    icmp->icmp_ip_snd_options_len;
2378 			rw_exit(&icmp->icmp_rwlock);
2379 			(void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL:
2380 			    RD(connp->conn_rq), connp,
2381 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
2382 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2383 			break;
2384 		case IP_HDRINCL:
2385 			if (!checkonly)
2386 				icmp->icmp_hdrincl = onoff;
2387 			break;
2388 		case IP_TOS:
2389 		case T_IP_TOS:
2390 			if (!checkonly) {
2391 				icmp->icmp_type_of_service = (uint8_t)*i1;
2392 			}
2393 			break;
2394 		case IP_TTL:
2395 			if (!checkonly) {
2396 				icmp->icmp_ttl = (uint8_t)*i1;
2397 			}
2398 			break;
2399 		case IP_MULTICAST_IF:
2400 			/*
2401 			 * TODO should check OPTMGMT reply and undo this if
2402 			 * there is an error.
2403 			 */
2404 			if (!checkonly) {
2405 				icmp->icmp_multicast_if_addr = *i1;
2406 				PASS_OPT_TO_IP(connp);
2407 			}
2408 			break;
2409 		case IP_MULTICAST_TTL:
2410 			if (!checkonly)
2411 				icmp->icmp_multicast_ttl = *invalp;
2412 			break;
2413 		case IP_MULTICAST_LOOP:
2414 			if (!checkonly) {
2415 				connp->conn_multicast_loop =
2416 				    (*invalp == 0) ? 0 : 1;
2417 				PASS_OPT_TO_IP(connp);
2418 			}
2419 			break;
2420 		case IP_BOUND_IF:
2421 			if (!checkonly) {
2422 				icmp->icmp_bound_if = *i1;
2423 				PASS_OPT_TO_IP(connp);
2424 			}
2425 			break;
2426 		case IP_UNSPEC_SRC:
2427 			if (!checkonly) {
2428 				icmp->icmp_unspec_source = onoff;
2429 				PASS_OPT_TO_IP(connp);
2430 			}
2431 			break;
2432 		case IP_BROADCAST_TTL:
2433 			if (!checkonly)
2434 				connp->conn_broadcast_ttl = *invalp;
2435 			break;
2436 		case IP_RECVIF:
2437 			if (!checkonly) {
2438 				icmp->icmp_recvif = onoff;
2439 			}
2440 			/*
2441 			 * pass to ip
2442 			 */
2443 			return (-EINVAL);
2444 		case IP_PKTINFO: {
2445 			/*
2446 			 * This also handles IP_RECVPKTINFO.
2447 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
2448 			 * Differentiation is based on the size of the argument
2449 			 * passed in.
2450 			 */
2451 			struct in_pktinfo *pktinfop;
2452 			ip4_pkt_t *attr_pktinfop;
2453 
2454 			if (checkonly)
2455 				break;
2456 
2457 			if (inlen == sizeof (int)) {
2458 				/*
2459 				 * This is IP_RECVPKTINFO option.
2460 				 * Keep a local copy of wether this option is
2461 				 * set or not and pass it down to IP for
2462 				 * processing.
2463 				 */
2464 				icmp->icmp_ip_recvpktinfo = onoff;
2465 				return (-EINVAL);
2466 			}
2467 
2468 
2469 			if (inlen != sizeof (struct in_pktinfo)) {
2470 				return (EINVAL);
2471 			}
2472 
2473 			if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs)
2474 			    == NULL) {
2475 				/*
2476 				 * sticky option is not supported
2477 				 */
2478 				return (EINVAL);
2479 			}
2480 
2481 			pktinfop = (struct in_pktinfo *)invalp;
2482 
2483 			/*
2484 			 * Atleast one of the values should be specified
2485 			 */
2486 			if (pktinfop->ipi_ifindex == 0 &&
2487 			    pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) {
2488 				return (EINVAL);
2489 			}
2490 
2491 			attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr;
2492 			attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex;
2493 		}
2494 			break;
2495 		case IP_ADD_MEMBERSHIP:
2496 		case IP_DROP_MEMBERSHIP:
2497 		case IP_BLOCK_SOURCE:
2498 		case IP_UNBLOCK_SOURCE:
2499 		case IP_ADD_SOURCE_MEMBERSHIP:
2500 		case IP_DROP_SOURCE_MEMBERSHIP:
2501 		case MCAST_JOIN_GROUP:
2502 		case MCAST_LEAVE_GROUP:
2503 		case MCAST_BLOCK_SOURCE:
2504 		case MCAST_UNBLOCK_SOURCE:
2505 		case MCAST_JOIN_SOURCE_GROUP:
2506 		case MCAST_LEAVE_SOURCE_GROUP:
2507 		case MRT_INIT:
2508 		case MRT_DONE:
2509 		case MRT_ADD_VIF:
2510 		case MRT_DEL_VIF:
2511 		case MRT_ADD_MFC:
2512 		case MRT_DEL_MFC:
2513 		case MRT_VERSION:
2514 		case MRT_ASSERT:
2515 		case IP_SEC_OPT:
2516 		case IP_NEXTHOP:
2517 			/*
2518 			 * "soft" error (negative)
2519 			 * option not handled at this level
2520 			 * Note: Do not modify *outlenp
2521 			 */
2522 			return (-EINVAL);
2523 		default:
2524 			*outlenp = 0;
2525 			return (EINVAL);
2526 		}
2527 		break;
2528 	case IPPROTO_IPV6: {
2529 		ip6_pkt_t		*ipp;
2530 		boolean_t		sticky;
2531 
2532 		if (icmp->icmp_family != AF_INET6) {
2533 			*outlenp = 0;
2534 			return (ENOPROTOOPT);
2535 		}
2536 		/*
2537 		 * Deal with both sticky options and ancillary data
2538 		 */
2539 		if (thisdg_attrs == NULL) {
2540 			/* sticky options, or none */
2541 			ipp = &icmp->icmp_sticky_ipp;
2542 			sticky = B_TRUE;
2543 		} else {
2544 			/* ancillary data */
2545 			ipp = (ip6_pkt_t *)thisdg_attrs;
2546 			sticky = B_FALSE;
2547 		}
2548 
2549 		switch (name) {
2550 		case IPV6_MULTICAST_IF:
2551 			if (!checkonly) {
2552 				icmp->icmp_multicast_if_index = *i1;
2553 				PASS_OPT_TO_IP(connp);
2554 			}
2555 			break;
2556 		case IPV6_UNICAST_HOPS:
2557 			/* -1 means use default */
2558 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2559 				*outlenp = 0;
2560 				return (EINVAL);
2561 			}
2562 			if (!checkonly) {
2563 				if (*i1 == -1) {
2564 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2565 					    is->is_ipv6_hoplimit;
2566 					ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
2567 					/* Pass modified value to IP. */
2568 					*i1 = ipp->ipp_hoplimit;
2569 				} else {
2570 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2571 					    (uint8_t)*i1;
2572 					ipp->ipp_fields |= IPPF_UNICAST_HOPS;
2573 				}
2574 				/* Rebuild the header template */
2575 				error = icmp_build_hdrs(icmp);
2576 				if (error != 0) {
2577 					*outlenp = 0;
2578 					return (error);
2579 				}
2580 			}
2581 			break;
2582 		case IPV6_MULTICAST_HOPS:
2583 			/* -1 means use default */
2584 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2585 				*outlenp = 0;
2586 				return (EINVAL);
2587 			}
2588 			if (!checkonly) {
2589 				if (*i1 == -1) {
2590 					icmp->icmp_multicast_ttl =
2591 					    ipp->ipp_multicast_hops =
2592 					    IP_DEFAULT_MULTICAST_TTL;
2593 					ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS;
2594 					/* Pass modified value to IP. */
2595 					*i1 = icmp->icmp_multicast_ttl;
2596 				} else {
2597 					icmp->icmp_multicast_ttl =
2598 					    ipp->ipp_multicast_hops =
2599 					    (uint8_t)*i1;
2600 					ipp->ipp_fields |= IPPF_MULTICAST_HOPS;
2601 				}
2602 			}
2603 			break;
2604 		case IPV6_MULTICAST_LOOP:
2605 			if (*i1 != 0 && *i1 != 1) {
2606 				*outlenp = 0;
2607 				return (EINVAL);
2608 			}
2609 			if (!checkonly) {
2610 				connp->conn_multicast_loop = *i1;
2611 				PASS_OPT_TO_IP(connp);
2612 			}
2613 			break;
2614 		case IPV6_CHECKSUM:
2615 			/*
2616 			 * Integer offset into the user data of where the
2617 			 * checksum is located.
2618 			 * Offset of -1 disables option.
2619 			 * Does not apply to IPPROTO_ICMPV6.
2620 			 */
2621 			if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) {
2622 				*outlenp = 0;
2623 				return (EINVAL);
2624 			}
2625 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2626 				/* Negative or not 16 bit aligned offset */
2627 				*outlenp = 0;
2628 				return (EINVAL);
2629 			}
2630 			if (checkonly)
2631 				break;
2632 
2633 			if (*i1 == -1) {
2634 				icmp->icmp_raw_checksum = 0;
2635 				ipp->ipp_fields &= ~IPPF_RAW_CKSUM;
2636 			} else {
2637 				icmp->icmp_raw_checksum = 1;
2638 				icmp->icmp_checksum_off = *i1;
2639 				ipp->ipp_fields |= IPPF_RAW_CKSUM;
2640 			}
2641 			/* Rebuild the header template */
2642 			error = icmp_build_hdrs(icmp);
2643 			if (error != 0) {
2644 				*outlenp = 0;
2645 				return (error);
2646 			}
2647 			break;
2648 		case IPV6_JOIN_GROUP:
2649 		case IPV6_LEAVE_GROUP:
2650 		case MCAST_JOIN_GROUP:
2651 		case MCAST_LEAVE_GROUP:
2652 		case MCAST_BLOCK_SOURCE:
2653 		case MCAST_UNBLOCK_SOURCE:
2654 		case MCAST_JOIN_SOURCE_GROUP:
2655 		case MCAST_LEAVE_SOURCE_GROUP:
2656 			/*
2657 			 * "soft" error (negative)
2658 			 * option not handled at this level
2659 			 * Note: Do not modify *outlenp
2660 			 */
2661 			return (-EINVAL);
2662 		case IPV6_BOUND_IF:
2663 			if (!checkonly) {
2664 				icmp->icmp_bound_if = *i1;
2665 				PASS_OPT_TO_IP(connp);
2666 			}
2667 			break;
2668 		case IPV6_UNSPEC_SRC:
2669 			if (!checkonly) {
2670 				icmp->icmp_unspec_source = onoff;
2671 				PASS_OPT_TO_IP(connp);
2672 			}
2673 			break;
2674 		case IPV6_RECVTCLASS:
2675 			if (!checkonly) {
2676 				icmp->icmp_ipv6_recvtclass = onoff;
2677 				PASS_OPT_TO_IP(connp);
2678 			}
2679 			break;
2680 		/*
2681 		 * Set boolean switches for ancillary data delivery
2682 		 */
2683 		case IPV6_RECVPKTINFO:
2684 			if (!checkonly) {
2685 				icmp->icmp_ip_recvpktinfo = onoff;
2686 				PASS_OPT_TO_IP(connp);
2687 			}
2688 			break;
2689 		case IPV6_RECVPATHMTU:
2690 			if (!checkonly) {
2691 				icmp->icmp_ipv6_recvpathmtu = onoff;
2692 				PASS_OPT_TO_IP(connp);
2693 			}
2694 			break;
2695 		case IPV6_RECVHOPLIMIT:
2696 			if (!checkonly) {
2697 				icmp->icmp_ipv6_recvhoplimit = onoff;
2698 				PASS_OPT_TO_IP(connp);
2699 			}
2700 			break;
2701 		case IPV6_RECVHOPOPTS:
2702 			if (!checkonly) {
2703 				icmp->icmp_ipv6_recvhopopts = onoff;
2704 				PASS_OPT_TO_IP(connp);
2705 			}
2706 			break;
2707 		case IPV6_RECVDSTOPTS:
2708 			if (!checkonly) {
2709 				icmp->icmp_ipv6_recvdstopts = onoff;
2710 				PASS_OPT_TO_IP(connp);
2711 			}
2712 			break;
2713 		case _OLD_IPV6_RECVDSTOPTS:
2714 			if (!checkonly)
2715 				icmp->icmp_old_ipv6_recvdstopts = onoff;
2716 			break;
2717 		case IPV6_RECVRTHDRDSTOPTS:
2718 			if (!checkonly) {
2719 				icmp->icmp_ipv6_recvrtdstopts = onoff;
2720 				PASS_OPT_TO_IP(connp);
2721 			}
2722 			break;
2723 		case IPV6_RECVRTHDR:
2724 			if (!checkonly) {
2725 				icmp->icmp_ipv6_recvrthdr = onoff;
2726 				PASS_OPT_TO_IP(connp);
2727 			}
2728 			break;
2729 		/*
2730 		 * Set sticky options or ancillary data.
2731 		 * If sticky options, (re)build any extension headers
2732 		 * that might be needed as a result.
2733 		 */
2734 		case IPV6_PKTINFO:
2735 			/*
2736 			 * The source address and ifindex are verified
2737 			 * in ip_opt_set(). For ancillary data the
2738 			 * source address is checked in ip_wput_v6.
2739 			 */
2740 			if (inlen != 0 && inlen !=
2741 			    sizeof (struct in6_pktinfo)) {
2742 				return (EINVAL);
2743 			}
2744 			if (checkonly)
2745 				break;
2746 
2747 			if (inlen == 0) {
2748 				ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
2749 				ipp->ipp_sticky_ignored |=
2750 				    (IPPF_IFINDEX|IPPF_ADDR);
2751 			} else {
2752 				struct in6_pktinfo *pkti;
2753 
2754 				pkti = (struct in6_pktinfo *)invalp;
2755 				ipp->ipp_ifindex = pkti->ipi6_ifindex;
2756 				ipp->ipp_addr = pkti->ipi6_addr;
2757 				if (ipp->ipp_ifindex != 0)
2758 					ipp->ipp_fields |= IPPF_IFINDEX;
2759 				else
2760 					ipp->ipp_fields &= ~IPPF_IFINDEX;
2761 				if (!IN6_IS_ADDR_UNSPECIFIED(
2762 				    &ipp->ipp_addr))
2763 					ipp->ipp_fields |= IPPF_ADDR;
2764 				else
2765 					ipp->ipp_fields &= ~IPPF_ADDR;
2766 			}
2767 			if (sticky) {
2768 				error = icmp_build_hdrs(icmp);
2769 				if (error != 0)
2770 					return (error);
2771 				PASS_OPT_TO_IP(connp);
2772 			}
2773 			break;
2774 		case IPV6_HOPLIMIT:
2775 			/* This option can only be used as ancillary data. */
2776 			if (sticky)
2777 				return (EINVAL);
2778 			if (inlen != 0 && inlen != sizeof (int))
2779 				return (EINVAL);
2780 			if (checkonly)
2781 				break;
2782 
2783 			if (inlen == 0) {
2784 				ipp->ipp_fields &= ~IPPF_HOPLIMIT;
2785 				ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT;
2786 			} else {
2787 				if (*i1 > 255 || *i1 < -1)
2788 					return (EINVAL);
2789 				if (*i1 == -1)
2790 					ipp->ipp_hoplimit =
2791 					    is->is_ipv6_hoplimit;
2792 				else
2793 					ipp->ipp_hoplimit = *i1;
2794 				ipp->ipp_fields |= IPPF_HOPLIMIT;
2795 			}
2796 			break;
2797 		case IPV6_TCLASS:
2798 			/*
2799 			 * IPV6_RECVTCLASS accepts -1 as use kernel default
2800 			 * and [0, 255] as the actualy traffic class.
2801 			 */
2802 			if (inlen != 0 && inlen != sizeof (int)) {
2803 				return (EINVAL);
2804 			}
2805 			if (checkonly)
2806 				break;
2807 
2808 			if (inlen == 0) {
2809 				ipp->ipp_fields &= ~IPPF_TCLASS;
2810 				ipp->ipp_sticky_ignored |= IPPF_TCLASS;
2811 			} else {
2812 				if (*i1 >= 256 || *i1 < -1)
2813 					return (EINVAL);
2814 				if (*i1 == -1) {
2815 					ipp->ipp_tclass =
2816 					    IPV6_FLOW_TCLASS(
2817 					    IPV6_DEFAULT_VERS_AND_FLOW);
2818 				} else {
2819 					ipp->ipp_tclass = *i1;
2820 				}
2821 				ipp->ipp_fields |= IPPF_TCLASS;
2822 			}
2823 			if (sticky) {
2824 				error = icmp_build_hdrs(icmp);
2825 				if (error != 0)
2826 					return (error);
2827 			}
2828 			break;
2829 		case IPV6_NEXTHOP:
2830 			/*
2831 			 * IP will verify that the nexthop is reachable
2832 			 * and fail for sticky options.
2833 			 */
2834 			if (inlen != 0 && inlen != sizeof (sin6_t)) {
2835 				return (EINVAL);
2836 			}
2837 			if (checkonly)
2838 				break;
2839 
2840 			if (inlen == 0) {
2841 				ipp->ipp_fields &= ~IPPF_NEXTHOP;
2842 				ipp->ipp_sticky_ignored |= IPPF_NEXTHOP;
2843 			} else {
2844 				sin6_t *sin6 = (sin6_t *)invalp;
2845 
2846 				if (sin6->sin6_family != AF_INET6) {
2847 					return (EAFNOSUPPORT);
2848 				}
2849 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
2850 					return (EADDRNOTAVAIL);
2851 				}
2852 				ipp->ipp_nexthop = sin6->sin6_addr;
2853 				if (!IN6_IS_ADDR_UNSPECIFIED(
2854 				    &ipp->ipp_nexthop))
2855 					ipp->ipp_fields |= IPPF_NEXTHOP;
2856 				else
2857 					ipp->ipp_fields &= ~IPPF_NEXTHOP;
2858 			}
2859 			if (sticky) {
2860 				error = icmp_build_hdrs(icmp);
2861 				if (error != 0)
2862 					return (error);
2863 				PASS_OPT_TO_IP(connp);
2864 			}
2865 			break;
2866 		case IPV6_HOPOPTS: {
2867 			ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
2868 			/*
2869 			 * Sanity checks - minimum size, size a multiple of
2870 			 * eight bytes, and matching size passed in.
2871 			 */
2872 			if (inlen != 0 &&
2873 			    inlen != (8 * (hopts->ip6h_len + 1))) {
2874 				return (EINVAL);
2875 			}
2876 
2877 			if (checkonly)
2878 				break;
2879 			error = optcom_pkt_set(invalp, inlen, sticky,
2880 			    (uchar_t **)&ipp->ipp_hopopts,
2881 			    &ipp->ipp_hopoptslen,
2882 			    sticky ? icmp->icmp_label_len_v6 : 0);
2883 			if (error != 0)
2884 				return (error);
2885 			if (ipp->ipp_hopoptslen == 0) {
2886 				ipp->ipp_fields &= ~IPPF_HOPOPTS;
2887 				ipp->ipp_sticky_ignored |= IPPF_HOPOPTS;
2888 			} else {
2889 				ipp->ipp_fields |= IPPF_HOPOPTS;
2890 			}
2891 			if (sticky) {
2892 				error = icmp_build_hdrs(icmp);
2893 				if (error != 0)
2894 					return (error);
2895 			}
2896 			break;
2897 		}
2898 		case IPV6_RTHDRDSTOPTS: {
2899 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2900 
2901 			/*
2902 			 * Sanity checks - minimum size, size a multiple of
2903 			 * eight bytes, and matching size passed in.
2904 			 */
2905 			if (inlen != 0 &&
2906 			    inlen != (8 * (dopts->ip6d_len + 1)))
2907 				return (EINVAL);
2908 
2909 			if (checkonly)
2910 				break;
2911 
2912 			if (inlen == 0) {
2913 				if (sticky &&
2914 				    (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) {
2915 					kmem_free(ipp->ipp_rtdstopts,
2916 					    ipp->ipp_rtdstoptslen);
2917 					ipp->ipp_rtdstopts = NULL;
2918 					ipp->ipp_rtdstoptslen = 0;
2919 				}
2920 				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
2921 				ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
2922 			} else {
2923 				error = optcom_pkt_set(invalp, inlen, sticky,
2924 				    (uchar_t **)&ipp->ipp_rtdstopts,
2925 				    &ipp->ipp_rtdstoptslen, 0);
2926 				if (error != 0)
2927 					return (error);
2928 				ipp->ipp_fields |= IPPF_RTDSTOPTS;
2929 			}
2930 			if (sticky) {
2931 				error = icmp_build_hdrs(icmp);
2932 				if (error != 0)
2933 					return (error);
2934 			}
2935 			break;
2936 		}
2937 		case IPV6_DSTOPTS: {
2938 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2939 
2940 			/*
2941 			 * Sanity checks - minimum size, size a multiple of
2942 			 * eight bytes, and matching size passed in.
2943 			 */
2944 			if (inlen != 0 &&
2945 			    inlen != (8 * (dopts->ip6d_len + 1)))
2946 				return (EINVAL);
2947 
2948 			if (checkonly)
2949 				break;
2950 
2951 			if (inlen == 0) {
2952 				if (sticky &&
2953 				    (ipp->ipp_fields & IPPF_DSTOPTS) != 0) {
2954 					kmem_free(ipp->ipp_dstopts,
2955 					    ipp->ipp_dstoptslen);
2956 					ipp->ipp_dstopts = NULL;
2957 					ipp->ipp_dstoptslen = 0;
2958 				}
2959 				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2960 				ipp->ipp_sticky_ignored |= IPPF_DSTOPTS;
2961 			} else {
2962 				error = optcom_pkt_set(invalp, inlen, sticky,
2963 				    (uchar_t **)&ipp->ipp_dstopts,
2964 				    &ipp->ipp_dstoptslen, 0);
2965 				if (error != 0)
2966 					return (error);
2967 				ipp->ipp_fields |= IPPF_DSTOPTS;
2968 			}
2969 			if (sticky) {
2970 				error = icmp_build_hdrs(icmp);
2971 				if (error != 0)
2972 					return (error);
2973 			}
2974 			break;
2975 		}
2976 		case IPV6_RTHDR: {
2977 			ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
2978 
2979 			/*
2980 			 * Sanity checks - minimum size, size a multiple of
2981 			 * eight bytes, and matching size passed in.
2982 			 */
2983 			if (inlen != 0 &&
2984 			    inlen != (8 * (rt->ip6r_len + 1)))
2985 				return (EINVAL);
2986 
2987 			if (checkonly)
2988 				break;
2989 
2990 			if (inlen == 0) {
2991 				if (sticky &&
2992 				    (ipp->ipp_fields & IPPF_RTHDR) != 0) {
2993 					kmem_free(ipp->ipp_rthdr,
2994 					    ipp->ipp_rthdrlen);
2995 					ipp->ipp_rthdr = NULL;
2996 					ipp->ipp_rthdrlen = 0;
2997 				}
2998 				ipp->ipp_fields &= ~IPPF_RTHDR;
2999 				ipp->ipp_sticky_ignored |= IPPF_RTHDR;
3000 			} else {
3001 				error = optcom_pkt_set(invalp, inlen, sticky,
3002 				    (uchar_t **)&ipp->ipp_rthdr,
3003 				    &ipp->ipp_rthdrlen, 0);
3004 				if (error != 0)
3005 					return (error);
3006 				ipp->ipp_fields |= IPPF_RTHDR;
3007 			}
3008 			if (sticky) {
3009 				error = icmp_build_hdrs(icmp);
3010 				if (error != 0)
3011 					return (error);
3012 			}
3013 			break;
3014 		}
3015 
3016 		case IPV6_DONTFRAG:
3017 			if (checkonly)
3018 				break;
3019 
3020 			if (onoff) {
3021 				ipp->ipp_fields |= IPPF_DONTFRAG;
3022 			} else {
3023 				ipp->ipp_fields &= ~IPPF_DONTFRAG;
3024 			}
3025 			break;
3026 
3027 		case IPV6_USE_MIN_MTU:
3028 			if (inlen != sizeof (int))
3029 				return (EINVAL);
3030 
3031 			if (*i1 < -1 || *i1 > 1)
3032 				return (EINVAL);
3033 
3034 			if (checkonly)
3035 				break;
3036 
3037 			ipp->ipp_fields |= IPPF_USE_MIN_MTU;
3038 			ipp->ipp_use_min_mtu = *i1;
3039 			break;
3040 
3041 		/*
3042 		 * This option can't be set.  Its only returned via
3043 		 * getsockopt() or ancillary data.
3044 		 */
3045 		case IPV6_PATHMTU:
3046 			return (EINVAL);
3047 
3048 		case IPV6_SEC_OPT:
3049 		case IPV6_SRC_PREFERENCES:
3050 		case IPV6_V6ONLY:
3051 			/* Handled at IP level */
3052 			return (-EINVAL);
3053 		default:
3054 			*outlenp = 0;
3055 			return (EINVAL);
3056 		}
3057 		break;
3058 	}		/* end IPPROTO_IPV6 */
3059 
3060 	case IPPROTO_ICMPV6:
3061 		/*
3062 		 * Only allow IPv6 option processing on IPv6 sockets.
3063 		 */
3064 		if (icmp->icmp_family != AF_INET6) {
3065 			*outlenp = 0;
3066 			return (ENOPROTOOPT);
3067 		}
3068 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
3069 			*outlenp = 0;
3070 			return (ENOPROTOOPT);
3071 		}
3072 		switch (name) {
3073 		case ICMP6_FILTER:
3074 			if (!checkonly) {
3075 				if ((inlen != 0) &&
3076 				    (inlen != sizeof (icmp6_filter_t)))
3077 					return (EINVAL);
3078 
3079 				if (inlen == 0) {
3080 					if (icmp->icmp_filter != NULL) {
3081 						kmem_free(icmp->icmp_filter,
3082 						    sizeof (icmp6_filter_t));
3083 						icmp->icmp_filter = NULL;
3084 					}
3085 				} else {
3086 					if (icmp->icmp_filter == NULL) {
3087 						icmp->icmp_filter = kmem_alloc(
3088 						    sizeof (icmp6_filter_t),
3089 						    KM_NOSLEEP);
3090 						if (icmp->icmp_filter == NULL) {
3091 							*outlenp = 0;
3092 							return (ENOBUFS);
3093 						}
3094 					}
3095 					(void) bcopy(invalp, icmp->icmp_filter,
3096 					    inlen);
3097 				}
3098 			}
3099 			break;
3100 
3101 		default:
3102 			*outlenp = 0;
3103 			return (EINVAL);
3104 		}
3105 		break;
3106 	default:
3107 		*outlenp = 0;
3108 		return (EINVAL);
3109 	}
3110 	/*
3111 	 * Common case of OK return with outval same as inval.
3112 	 */
3113 	if (invalp != outvalp) {
3114 		/* don't trust bcopy for identical src/dst */
3115 		(void) bcopy(invalp, outvalp, inlen);
3116 	}
3117 	*outlenp = inlen;
3118 	return (0);
3119 }
3120 
3121 /* This routine sets socket options. */
3122 /* ARGSUSED */
3123 int
3124 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
3125     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3126     void *thisdg_attrs, cred_t *cr)
3127 {
3128 	boolean_t checkonly;
3129 	int	error;
3130 
3131 	error = 0;
3132 	switch (optset_context) {
3133 	case SETFN_OPTCOM_CHECKONLY:
3134 		checkonly = B_TRUE;
3135 		/*
3136 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
3137 		 * inlen != 0 implies value supplied and
3138 		 * 	we have to "pretend" to set it.
3139 		 * inlen == 0 implies that there is no
3140 		 * 	value part in T_CHECK request and just validation
3141 		 * done elsewhere should be enough, we just return here.
3142 		 */
3143 		if (inlen == 0) {
3144 			*outlenp = 0;
3145 			error = 0;
3146 			goto done;
3147 		}
3148 		break;
3149 	case SETFN_OPTCOM_NEGOTIATE:
3150 		checkonly = B_FALSE;
3151 		break;
3152 	case SETFN_UD_NEGOTIATE:
3153 	case SETFN_CONN_NEGOTIATE:
3154 		checkonly = B_FALSE;
3155 		/*
3156 		 * Negotiating local and "association-related" options
3157 		 * through T_UNITDATA_REQ.
3158 		 *
3159 		 * Following routine can filter out ones we do not
3160 		 * want to be "set" this way.
3161 		 */
3162 		if (!icmp_opt_allow_udr_set(level, name)) {
3163 			*outlenp = 0;
3164 			error = EINVAL;
3165 			goto done;
3166 		}
3167 		break;
3168 	default:
3169 		/*
3170 		 * We should never get here
3171 		 */
3172 		*outlenp = 0;
3173 		error = EINVAL;
3174 		goto done;
3175 	}
3176 
3177 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
3178 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
3179 	error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
3180 	    outvalp, cr, thisdg_attrs, checkonly);
3181 
3182 done:
3183 	return (error);
3184 }
3185 
3186 /* This routine sets socket options. */
3187 /* ARGSUSED */
3188 int
3189 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
3190     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3191     void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
3192 {
3193 	conn_t	*connp =  Q_TO_CONN(q);
3194 	icmp_t	*icmp;
3195 	int error;
3196 
3197 	icmp = connp->conn_icmp;
3198 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3199 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
3200 	    outlenp, outvalp, thisdg_attrs, cr);
3201 	rw_exit(&icmp->icmp_rwlock);
3202 	return (error);
3203 }
3204 
3205 /*
3206  * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl,
3207  * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum.
3208  * The headers include ip6i_t (if needed), ip6_t, and any sticky extension
3209  * headers.
3210  * Returns failure if can't allocate memory.
3211  */
3212 static int
3213 icmp_build_hdrs(icmp_t *icmp)
3214 {
3215 	icmp_stack_t *is = icmp->icmp_is;
3216 	uchar_t	*hdrs;
3217 	uint_t	hdrs_len;
3218 	ip6_t	*ip6h;
3219 	ip6i_t	*ip6i;
3220 	ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
3221 
3222 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
3223 	hdrs_len = ip_total_hdrs_len_v6(ipp);
3224 	ASSERT(hdrs_len != 0);
3225 	if (hdrs_len != icmp->icmp_sticky_hdrs_len) {
3226 		/* Need to reallocate */
3227 		if (hdrs_len != 0) {
3228 			hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
3229 			if (hdrs == NULL)
3230 				return (ENOMEM);
3231 		} else {
3232 			hdrs = NULL;
3233 		}
3234 		if (icmp->icmp_sticky_hdrs_len != 0) {
3235 			kmem_free(icmp->icmp_sticky_hdrs,
3236 			    icmp->icmp_sticky_hdrs_len);
3237 		}
3238 		icmp->icmp_sticky_hdrs = hdrs;
3239 		icmp->icmp_sticky_hdrs_len = hdrs_len;
3240 	}
3241 	ip_build_hdrs_v6(icmp->icmp_sticky_hdrs,
3242 	    icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto);
3243 
3244 	/* Set header fields not in ipp */
3245 	if (ipp->ipp_fields & IPPF_HAS_IP6I) {
3246 		ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs;
3247 		ip6h = (ip6_t *)&ip6i[1];
3248 
3249 		if (ipp->ipp_fields & IPPF_RAW_CKSUM) {
3250 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
3251 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
3252 		}
3253 		if (ipp->ipp_fields & IPPF_NO_CKSUM) {
3254 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
3255 		}
3256 	} else {
3257 		ip6h = (ip6_t *)icmp->icmp_sticky_hdrs;
3258 	}
3259 
3260 	if (!(ipp->ipp_fields & IPPF_ADDR))
3261 		ip6h->ip6_src = icmp->icmp_v6src;
3262 
3263 	/* Try to get everything in a single mblk */
3264 	if (hdrs_len > icmp->icmp_max_hdr_len) {
3265 		icmp->icmp_max_hdr_len = hdrs_len;
3266 		rw_exit(&icmp->icmp_rwlock);
3267 		(void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq,
3268 		    icmp->icmp_connp,
3269 		    icmp->icmp_max_hdr_len + is->is_wroff_extra);
3270 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3271 	}
3272 	return (0);
3273 }
3274 
3275 /*
3276  * This routine retrieves the value of an ND variable in a icmpparam_t
3277  * structure.  It is called through nd_getset when a user reads the
3278  * variable.
3279  */
3280 /* ARGSUSED */
3281 static int
3282 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
3283 {
3284 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3285 
3286 	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
3287 	return (0);
3288 }
3289 
3290 /*
3291  * Walk through the param array specified registering each element with the
3292  * named dispatch (ND) handler.
3293  */
3294 static boolean_t
3295 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
3296 {
3297 	for (; cnt-- > 0; icmppa++) {
3298 		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
3299 			if (!nd_load(ndp, icmppa->icmp_param_name,
3300 			    icmp_param_get, icmp_param_set,
3301 			    (caddr_t)icmppa)) {
3302 				nd_free(ndp);
3303 				return (B_FALSE);
3304 			}
3305 		}
3306 	}
3307 	return (B_TRUE);
3308 }
3309 
3310 /* This routine sets an ND variable in a icmpparam_t structure. */
3311 /* ARGSUSED */
3312 static int
3313 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
3314 {
3315 	long		new_value;
3316 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3317 
3318 	/*
3319 	 * Fail the request if the new value does not lie within the
3320 	 * required bounds.
3321 	 */
3322 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
3323 	    new_value < icmppa->icmp_param_min ||
3324 	    new_value > icmppa->icmp_param_max) {
3325 		return (EINVAL);
3326 	}
3327 	/* Set the new value */
3328 	icmppa->icmp_param_value = new_value;
3329 	return (0);
3330 }
3331 
3332 static mblk_t *
3333 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
3334 {
3335 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
3336 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
3337 		/*
3338 		 * fallback has started but messages have not been moved yet
3339 		 */
3340 		if (icmp->icmp_fallback_queue_head == NULL) {
3341 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
3342 			icmp->icmp_fallback_queue_head = mp;
3343 			icmp->icmp_fallback_queue_tail = mp;
3344 		} else {
3345 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
3346 			icmp->icmp_fallback_queue_tail->b_next = mp;
3347 			icmp->icmp_fallback_queue_tail = mp;
3348 		}
3349 		return (NULL);
3350 	} else {
3351 		/*
3352 		 * Fallback completed, let the caller putnext() the mblk.
3353 		 */
3354 		return (mp);
3355 	}
3356 }
3357 
3358 /*
3359  * Deliver data to ULP. In case we have a socket, and it's falling back to
3360  * TPI, then we'll queue the mp for later processing.
3361  */
3362 static void
3363 icmp_ulp_recv(conn_t *connp, mblk_t *mp)
3364 {
3365 
3366 	if (IPCL_IS_NONSTR(connp)) {
3367 		icmp_t *icmp = connp->conn_icmp;
3368 		int error;
3369 
3370 		if ((*connp->conn_upcalls->su_recv)
3371 		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
3372 		    NULL) < 0) {
3373 			mutex_enter(&icmp->icmp_recv_lock);
3374 			if (error == ENOSPC) {
3375 				/*
3376 				 * let's confirm while holding the lock
3377 				 */
3378 				if ((*connp->conn_upcalls->su_recv)
3379 				    (connp->conn_upper_handle, NULL, 0, 0,
3380 				    &error, NULL) < 0) {
3381 					ASSERT(error == ENOSPC);
3382 					if (error == ENOSPC) {
3383 						connp->conn_flow_cntrld =
3384 						    B_TRUE;
3385 					}
3386 				}
3387 				mutex_exit(&icmp->icmp_recv_lock);
3388 			} else {
3389 				ASSERT(error == EOPNOTSUPP);
3390 				mp = icmp_queue_fallback(icmp, mp);
3391 				mutex_exit(&icmp->icmp_recv_lock);
3392 				if (mp != NULL)
3393 					putnext(connp->conn_rq, mp);
3394 			}
3395 		}
3396 		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
3397 	} else {
3398 		putnext(connp->conn_rq, mp);
3399 	}
3400 }
3401 
3402 /*ARGSUSED2*/
3403 static void
3404 icmp_input(void *arg1, mblk_t *mp, void *arg2)
3405 {
3406 	conn_t *connp = (conn_t *)arg1;
3407 	struct T_unitdata_ind	*tudi;
3408 	uchar_t			*rptr;
3409 	icmp_t			*icmp;
3410 	icmp_stack_t		*is;
3411 	sin_t			*sin;
3412 	sin6_t			*sin6;
3413 	ip6_t			*ip6h;
3414 	ip6i_t			*ip6i;
3415 	mblk_t			*mp1;
3416 	int			hdr_len;
3417 	ipha_t			*ipha;
3418 	int			udi_size;	/* Size of T_unitdata_ind */
3419 	uint_t			ipvers;
3420 	ip6_pkt_t		ipp;
3421 	uint8_t			nexthdr;
3422 	ip_pktinfo_t		*pinfo = NULL;
3423 	mblk_t			*options_mp = NULL;
3424 	uint_t			icmp_opt = 0;
3425 	boolean_t		icmp_ipv6_recvhoplimit = B_FALSE;
3426 	uint_t			hopstrip;
3427 
3428 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
3429 
3430 	icmp = connp->conn_icmp;
3431 	is = icmp->icmp_is;
3432 	rptr = mp->b_rptr;
3433 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
3434 	ASSERT(OK_32PTR(rptr));
3435 
3436 	/*
3437 	 * IP should have prepended the options data in an M_CTL
3438 	 * Check M_CTL "type" to make sure are not here bcos of
3439 	 * a valid ICMP message
3440 	 */
3441 	if (DB_TYPE(mp) == M_CTL) {
3442 		/*
3443 		 * FIXME: does IP still do this?
3444 		 * IP sends up the IPSEC_IN message for handling IPSEC
3445 		 * policy at the TCP level. We don't need it here.
3446 		 */
3447 		if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) {
3448 			mp1 = mp->b_cont;
3449 			freeb(mp);
3450 			mp = mp1;
3451 			rptr = mp->b_rptr;
3452 		} else if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
3453 		    ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
3454 		    IN_PKTINFO) {
3455 			/*
3456 			 * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information
3457 			 * has been prepended to the packet by IP. We need to
3458 			 * extract the mblk and adjust the rptr
3459 			 */
3460 			pinfo = (ip_pktinfo_t *)mp->b_rptr;
3461 			options_mp = mp;
3462 			mp = mp->b_cont;
3463 			rptr = mp->b_rptr;
3464 		} else {
3465 			/*
3466 			 * ICMP messages.
3467 			 */
3468 			icmp_icmp_error(connp, mp);
3469 			return;
3470 		}
3471 	}
3472 
3473 	/*
3474 	 * Discard message if it is misaligned or smaller than the IP header.
3475 	 */
3476 	if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) {
3477 		freemsg(mp);
3478 		if (options_mp != NULL)
3479 			freeb(options_mp);
3480 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3481 		return;
3482 	}
3483 	ipvers = IPH_HDR_VERSION((ipha_t *)rptr);
3484 
3485 	/* Handle M_DATA messages containing IP packets messages */
3486 	if (ipvers == IPV4_VERSION) {
3487 		/*
3488 		 * Special case where IP attaches
3489 		 * the IRE needs to be handled so that we don't send up
3490 		 * IRE to the user land.
3491 		 */
3492 		ipha = (ipha_t *)rptr;
3493 		hdr_len = IPH_HDR_LENGTH(ipha);
3494 
3495 		if (ipha->ipha_protocol == IPPROTO_TCP) {
3496 			tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3497 
3498 			if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) ==
3499 			    TH_SYN) && mp->b_cont != NULL) {
3500 				mp1 = mp->b_cont;
3501 				if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3502 					freeb(mp1);
3503 					mp->b_cont = NULL;
3504 				}
3505 			}
3506 		}
3507 		if (is->is_bsd_compat) {
3508 			ushort_t len;
3509 			len = ntohs(ipha->ipha_length);
3510 
3511 			if (mp->b_datap->db_ref > 1) {
3512 				/*
3513 				 * Allocate a new IP header so that we can
3514 				 * modify ipha_length.
3515 				 */
3516 				mblk_t	*mp1;
3517 
3518 				mp1 = allocb(hdr_len, BPRI_MED);
3519 				if (!mp1) {
3520 					freemsg(mp);
3521 					if (options_mp != NULL)
3522 						freeb(options_mp);
3523 					BUMP_MIB(&is->is_rawip_mib,
3524 					    rawipInErrors);
3525 					return;
3526 				}
3527 				bcopy(rptr, mp1->b_rptr, hdr_len);
3528 				mp->b_rptr = rptr + hdr_len;
3529 				rptr = mp1->b_rptr;
3530 				ipha = (ipha_t *)rptr;
3531 				mp1->b_cont = mp;
3532 				mp1->b_wptr = rptr + hdr_len;
3533 				mp = mp1;
3534 			}
3535 			len -= hdr_len;
3536 			ipha->ipha_length = htons(len);
3537 		}
3538 	}
3539 
3540 	/*
3541 	 * This is the inbound data path.  Packets are passed upstream as
3542 	 * T_UNITDATA_IND messages with full IP headers still attached.
3543 	 */
3544 	if (icmp->icmp_family == AF_INET) {
3545 		ASSERT(ipvers == IPV4_VERSION);
3546 		udi_size =  sizeof (struct T_unitdata_ind) + sizeof (sin_t);
3547 		if (icmp->icmp_recvif && (pinfo != NULL) &&
3548 		    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3549 			udi_size += sizeof (struct T_opthdr) +
3550 			    sizeof (uint_t);
3551 		}
3552 
3553 		if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3554 		    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3555 			udi_size += sizeof (struct T_opthdr) +
3556 			    sizeof (struct in_pktinfo);
3557 		}
3558 
3559 		/*
3560 		 * If SO_TIMESTAMP is set allocate the appropriate sized
3561 		 * buffer. Since gethrestime() expects a pointer aligned
3562 		 * argument, we allocate space necessary for extra
3563 		 * alignment (even though it might not be used).
3564 		 */
3565 		if (icmp->icmp_timestamp) {
3566 			udi_size += sizeof (struct T_opthdr) +
3567 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3568 		}
3569 		mp1 = allocb(udi_size, BPRI_MED);
3570 		if (mp1 == NULL) {
3571 			freemsg(mp);
3572 			if (options_mp != NULL)
3573 				freeb(options_mp);
3574 			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3575 			return;
3576 		}
3577 		mp1->b_cont = mp;
3578 		mp = mp1;
3579 		tudi = (struct T_unitdata_ind *)mp->b_rptr;
3580 		mp->b_datap->db_type = M_PROTO;
3581 		mp->b_wptr = (uchar_t *)tudi + udi_size;
3582 		tudi->PRIM_type = T_UNITDATA_IND;
3583 		tudi->SRC_length = sizeof (sin_t);
3584 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3585 		sin = (sin_t *)&tudi[1];
3586 		*sin = sin_null;
3587 		sin->sin_family = AF_INET;
3588 		sin->sin_addr.s_addr = ipha->ipha_src;
3589 		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
3590 		    sizeof (sin_t);
3591 		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
3592 		tudi->OPT_length = udi_size;
3593 
3594 		/*
3595 		 * Add options if IP_RECVIF is set
3596 		 */
3597 		if (udi_size != 0) {
3598 			char *dstopt;
3599 
3600 			dstopt = (char *)&sin[1];
3601 			if (icmp->icmp_recvif && (pinfo != NULL) &&
3602 			    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3603 
3604 				struct T_opthdr *toh;
3605 				uint_t		*dstptr;
3606 
3607 				toh = (struct T_opthdr *)dstopt;
3608 				toh->level = IPPROTO_IP;
3609 				toh->name = IP_RECVIF;
3610 				toh->len = sizeof (struct T_opthdr) +
3611 				    sizeof (uint_t);
3612 				toh->status = 0;
3613 				dstopt += sizeof (struct T_opthdr);
3614 				dstptr = (uint_t *)dstopt;
3615 				*dstptr = pinfo->ip_pkt_ifindex;
3616 				dstopt += sizeof (uint_t);
3617 				udi_size -= toh->len;
3618 			}
3619 			if (icmp->icmp_timestamp) {
3620 				struct	T_opthdr *toh;
3621 
3622 				toh = (struct T_opthdr *)dstopt;
3623 				toh->level = SOL_SOCKET;
3624 				toh->name = SCM_TIMESTAMP;
3625 				toh->len = sizeof (struct T_opthdr) +
3626 				    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3627 				toh->status = 0;
3628 				dstopt += sizeof (struct T_opthdr);
3629 				/* Align for gethrestime() */
3630 				dstopt = (char *)P2ROUNDUP((intptr_t)dstopt,
3631 				    sizeof (intptr_t));
3632 				gethrestime((timestruc_t *)dstopt);
3633 				dstopt = (char *)toh + toh->len;
3634 				udi_size -= toh->len;
3635 			}
3636 			if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3637 			    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3638 				struct	T_opthdr *toh;
3639 				struct	in_pktinfo *pktinfop;
3640 
3641 				toh = (struct T_opthdr *)dstopt;
3642 				toh->level = IPPROTO_IP;
3643 				toh->name = IP_PKTINFO;
3644 				toh->len = sizeof (struct T_opthdr) +
3645 				    sizeof (in_pktinfo_t);
3646 				toh->status = 0;
3647 				dstopt += sizeof (struct T_opthdr);
3648 				pktinfop = (struct in_pktinfo *)dstopt;
3649 				pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex;
3650 				pktinfop->ipi_spec_dst =
3651 				    pinfo->ip_pkt_match_addr;
3652 
3653 				pktinfop->ipi_addr.s_addr = ipha->ipha_dst;
3654 
3655 				dstopt += sizeof (struct in_pktinfo);
3656 				udi_size -= toh->len;
3657 			}
3658 
3659 			/* Consumed all of allocated space */
3660 			ASSERT(udi_size == 0);
3661 		}
3662 
3663 		if (options_mp != NULL)
3664 			freeb(options_mp);
3665 
3666 		BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
3667 		goto deliver;
3668 	}
3669 
3670 	/*
3671 	 * We don't need options_mp in the IPv6 path.
3672 	 */
3673 	if (options_mp != NULL) {
3674 		freeb(options_mp);
3675 		options_mp = NULL;
3676 	}
3677 
3678 	/*
3679 	 * Discard message if it is smaller than the IPv6 header
3680 	 * or if the header is malformed.
3681 	 */
3682 	if ((mp->b_wptr - rptr) < sizeof (ip6_t) ||
3683 	    IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION ||
3684 	    icmp->icmp_family != AF_INET6) {
3685 		freemsg(mp);
3686 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3687 		return;
3688 	}
3689 
3690 	/* Initialize */
3691 	ipp.ipp_fields = 0;
3692 	hopstrip = 0;
3693 
3694 	ip6h = (ip6_t *)rptr;
3695 	/*
3696 	 * Call on ip_find_hdr_v6 which gets the total hdr len
3697 	 * as well as individual lenghts of ext hdrs (and ptrs to
3698 	 * them).
3699 	 */
3700 	if (ip6h->ip6_nxt != icmp->icmp_proto) {
3701 		/* Look for ifindex information */
3702 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3703 			ip6i = (ip6i_t *)ip6h;
3704 			if (ip6i->ip6i_flags & IP6I_IFINDEX) {
3705 				ASSERT(ip6i->ip6i_ifindex != 0);
3706 				ipp.ipp_fields |= IPPF_IFINDEX;
3707 				ipp.ipp_ifindex = ip6i->ip6i_ifindex;
3708 			}
3709 			rptr = (uchar_t *)&ip6i[1];
3710 			mp->b_rptr = rptr;
3711 			if (rptr == mp->b_wptr) {
3712 				mp1 = mp->b_cont;
3713 				freeb(mp);
3714 				mp = mp1;
3715 				rptr = mp->b_rptr;
3716 			}
3717 			ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN);
3718 			ip6h = (ip6_t *)rptr;
3719 		}
3720 		hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr);
3721 
3722 		/*
3723 		 * We need to lie a bit to the user because users inside
3724 		 * labeled compartments should not see their own labels.  We
3725 		 * assume that in all other respects IP has checked the label,
3726 		 * and that the label is always first among the options.  (If
3727 		 * it's not first, then this code won't see it, and the option
3728 		 * will be passed along to the user.)
3729 		 *
3730 		 * If we had multilevel ICMP sockets, then the following code
3731 		 * should be skipped for them to allow the user to see the
3732 		 * label.
3733 		 *
3734 		 * Alignment restrictions in the definition of IP options
3735 		 * (namely, the requirement that the 4-octet DOI goes on a
3736 		 * 4-octet boundary) mean that we know exactly where the option
3737 		 * should start, but we're lenient for other hosts.
3738 		 *
3739 		 * Note that there are no multilevel ICMP or raw IP sockets
3740 		 * yet, thus nobody ever sees the IP6OPT_LS option.
3741 		 */
3742 		if ((ipp.ipp_fields & IPPF_HOPOPTS) &&
3743 		    ipp.ipp_hopoptslen > 5 && is_system_labeled()) {
3744 			const uchar_t *ucp =
3745 			    (const uchar_t *)ipp.ipp_hopopts + 2;
3746 			int remlen = ipp.ipp_hopoptslen - 2;
3747 
3748 			while (remlen > 0) {
3749 				if (*ucp == IP6OPT_PAD1) {
3750 					remlen--;
3751 					ucp++;
3752 				} else if (*ucp == IP6OPT_PADN) {
3753 					remlen -= ucp[1] + 2;
3754 					ucp += ucp[1] + 2;
3755 				} else if (*ucp == ip6opt_ls) {
3756 					hopstrip = (ucp -
3757 					    (const uchar_t *)ipp.ipp_hopopts) +
3758 					    ucp[1] + 2;
3759 					hopstrip = (hopstrip + 7) & ~7;
3760 					break;
3761 				} else {
3762 					/* label option must be first */
3763 					break;
3764 				}
3765 			}
3766 		}
3767 	} else {
3768 		hdr_len = IPV6_HDR_LEN;
3769 		ip6i = NULL;
3770 		nexthdr = ip6h->ip6_nxt;
3771 	}
3772 	/*
3773 	 * One special case where IP attaches the IRE needs to
3774 	 * be handled so that we don't send up IRE to the user land.
3775 	 */
3776 	if (nexthdr == IPPROTO_TCP) {
3777 		tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3778 
3779 		if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) &&
3780 		    mp->b_cont != NULL) {
3781 			mp1 = mp->b_cont;
3782 			if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3783 				freeb(mp1);
3784 				mp->b_cont = NULL;
3785 			}
3786 		}
3787 	}
3788 	/*
3789 	 * Check a filter for ICMPv6 types if needed.
3790 	 * Verify raw checksums if needed.
3791 	 */
3792 	if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) {
3793 		if (icmp->icmp_filter != NULL) {
3794 			int type;
3795 
3796 			/* Assumes that IP has done the pullupmsg */
3797 			type = mp->b_rptr[hdr_len];
3798 
3799 			ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr);
3800 			if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
3801 				freemsg(mp);
3802 				return;
3803 			}
3804 		} else {
3805 			/* Checksum */
3806 			uint16_t	*up;
3807 			uint32_t	sum;
3808 			int		remlen;
3809 
3810 			up = (uint16_t *)&ip6h->ip6_src;
3811 
3812 			remlen = msgdsize(mp) - hdr_len;
3813 			sum = htons(icmp->icmp_proto + remlen)
3814 			    + up[0] + up[1] + up[2] + up[3]
3815 			    + up[4] + up[5] + up[6] + up[7]
3816 			    + up[8] + up[9] + up[10] + up[11]
3817 			    + up[12] + up[13] + up[14] + up[15];
3818 			sum = (sum & 0xffff) + (sum >> 16);
3819 			sum = IP_CSUM(mp, hdr_len, sum);
3820 			if (sum != 0) {
3821 				/* IPv6 RAW checksum failed */
3822 				ip0dbg(("icmp_rput: RAW checksum "
3823 				    "failed %x\n", sum));
3824 				freemsg(mp);
3825 				BUMP_MIB(&is->is_rawip_mib,
3826 				    rawipInCksumErrs);
3827 				return;
3828 			}
3829 		}
3830 	}
3831 	/* Skip all the IPv6 headers per API */
3832 	mp->b_rptr += hdr_len;
3833 
3834 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3835 
3836 	/*
3837 	 * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to
3838 	 * maintain state information, instead of relying on icmp_t
3839 	 * structure, since there arent any locks protecting these members
3840 	 * and there is a window where there might be a race between a
3841 	 * thread setting options on the write side and a thread reading
3842 	 * these options on the read size.
3843 	 */
3844 	if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS|
3845 	    IPPF_RTHDR|IPPF_IFINDEX)) {
3846 		if (icmp->icmp_ipv6_recvhopopts &&
3847 		    (ipp.ipp_fields & IPPF_HOPOPTS) &&
3848 		    ipp.ipp_hopoptslen > hopstrip) {
3849 			udi_size += sizeof (struct T_opthdr) +
3850 			    ipp.ipp_hopoptslen - hopstrip;
3851 			icmp_opt |= IPPF_HOPOPTS;
3852 		}
3853 		if ((icmp->icmp_ipv6_recvdstopts ||
3854 		    icmp->icmp_old_ipv6_recvdstopts) &&
3855 		    (ipp.ipp_fields & IPPF_DSTOPTS)) {
3856 			udi_size += sizeof (struct T_opthdr) +
3857 			    ipp.ipp_dstoptslen;
3858 			icmp_opt |= IPPF_DSTOPTS;
3859 		}
3860 		if (((icmp->icmp_ipv6_recvdstopts &&
3861 		    icmp->icmp_ipv6_recvrthdr &&
3862 		    (ipp.ipp_fields & IPPF_RTHDR)) ||
3863 		    icmp->icmp_ipv6_recvrtdstopts) &&
3864 		    (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
3865 			udi_size += sizeof (struct T_opthdr) +
3866 			    ipp.ipp_rtdstoptslen;
3867 			icmp_opt |= IPPF_RTDSTOPTS;
3868 		}
3869 		if (icmp->icmp_ipv6_recvrthdr &&
3870 		    (ipp.ipp_fields & IPPF_RTHDR)) {
3871 			udi_size += sizeof (struct T_opthdr) +
3872 			    ipp.ipp_rthdrlen;
3873 			icmp_opt |= IPPF_RTHDR;
3874 		}
3875 		if (icmp->icmp_ip_recvpktinfo &&
3876 		    (ipp.ipp_fields & IPPF_IFINDEX)) {
3877 			udi_size += sizeof (struct T_opthdr) +
3878 			    sizeof (struct in6_pktinfo);
3879 			icmp_opt |= IPPF_IFINDEX;
3880 		}
3881 	}
3882 	if (icmp->icmp_ipv6_recvhoplimit) {
3883 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3884 		icmp_ipv6_recvhoplimit = B_TRUE;
3885 	}
3886 
3887 	if (icmp->icmp_ipv6_recvtclass)
3888 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3889 
3890 	/*
3891 	 * If SO_TIMESTAMP is set allocate the appropriate sized
3892 	 * buffer. Since gethrestime() expects a pointer aligned
3893 	 * argument, we allocate space necessary for extra
3894 	 * alignment (even though it might not be used).
3895 	 */
3896 	if (icmp->icmp_timestamp) {
3897 		udi_size += sizeof (struct T_opthdr) +
3898 		    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3899 	}
3900 
3901 	mp1 = allocb(udi_size, BPRI_MED);
3902 	if (mp1 == NULL) {
3903 		freemsg(mp);
3904 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3905 		return;
3906 	}
3907 	mp1->b_cont = mp;
3908 	mp = mp1;
3909 	mp->b_datap->db_type = M_PROTO;
3910 	tudi = (struct T_unitdata_ind *)mp->b_rptr;
3911 	mp->b_wptr = (uchar_t *)tudi + udi_size;
3912 	tudi->PRIM_type = T_UNITDATA_IND;
3913 	tudi->SRC_length = sizeof (sin6_t);
3914 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3915 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3916 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
3917 	tudi->OPT_length = udi_size;
3918 	sin6 = (sin6_t *)&tudi[1];
3919 	sin6->sin6_port = 0;
3920 	sin6->sin6_family = AF_INET6;
3921 
3922 	sin6->sin6_addr = ip6h->ip6_src;
3923 	/* No sin6_flowinfo per API */
3924 	sin6->sin6_flowinfo = 0;
3925 	/* For link-scope source pass up scope id */
3926 	if ((ipp.ipp_fields & IPPF_IFINDEX) &&
3927 	    IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
3928 		sin6->sin6_scope_id = ipp.ipp_ifindex;
3929 	else
3930 		sin6->sin6_scope_id = 0;
3931 
3932 	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
3933 	    icmp->icmp_zoneid, is->is_netstack);
3934 
3935 	if (udi_size != 0) {
3936 		uchar_t *dstopt;
3937 
3938 		dstopt = (uchar_t *)&sin6[1];
3939 		if (icmp_opt & IPPF_IFINDEX) {
3940 			struct T_opthdr *toh;
3941 			struct in6_pktinfo *pkti;
3942 
3943 			toh = (struct T_opthdr *)dstopt;
3944 			toh->level = IPPROTO_IPV6;
3945 			toh->name = IPV6_PKTINFO;
3946 			toh->len = sizeof (struct T_opthdr) +
3947 			    sizeof (*pkti);
3948 			toh->status = 0;
3949 			dstopt += sizeof (struct T_opthdr);
3950 			pkti = (struct in6_pktinfo *)dstopt;
3951 			pkti->ipi6_addr = ip6h->ip6_dst;
3952 			pkti->ipi6_ifindex = ipp.ipp_ifindex;
3953 			dstopt += sizeof (*pkti);
3954 			udi_size -= toh->len;
3955 		}
3956 		if (icmp_ipv6_recvhoplimit) {
3957 			struct T_opthdr *toh;
3958 
3959 			toh = (struct T_opthdr *)dstopt;
3960 			toh->level = IPPROTO_IPV6;
3961 			toh->name = IPV6_HOPLIMIT;
3962 			toh->len = sizeof (struct T_opthdr) +
3963 			    sizeof (uint_t);
3964 			toh->status = 0;
3965 			dstopt += sizeof (struct T_opthdr);
3966 			*(uint_t *)dstopt = ip6h->ip6_hops;
3967 			dstopt += sizeof (uint_t);
3968 			udi_size -= toh->len;
3969 		}
3970 		if (icmp->icmp_ipv6_recvtclass) {
3971 			struct T_opthdr *toh;
3972 
3973 			toh = (struct T_opthdr *)dstopt;
3974 			toh->level = IPPROTO_IPV6;
3975 			toh->name = IPV6_TCLASS;
3976 			toh->len = sizeof (struct T_opthdr) +
3977 			    sizeof (uint_t);
3978 			toh->status = 0;
3979 			dstopt += sizeof (struct T_opthdr);
3980 			*(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
3981 			dstopt += sizeof (uint_t);
3982 			udi_size -= toh->len;
3983 		}
3984 		if (icmp->icmp_timestamp) {
3985 			struct  T_opthdr *toh;
3986 
3987 			toh = (struct T_opthdr *)dstopt;
3988 			toh->level = SOL_SOCKET;
3989 			toh->name = SCM_TIMESTAMP;
3990 			toh->len = sizeof (struct T_opthdr) +
3991 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3992 			toh->status = 0;
3993 			dstopt += sizeof (struct T_opthdr);
3994 			/* Align for gethrestime() */
3995 			dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt,
3996 			    sizeof (intptr_t));
3997 			gethrestime((timestruc_t *)dstopt);
3998 			dstopt = (uchar_t *)toh + toh->len;
3999 			udi_size -= toh->len;
4000 		}
4001 
4002 		if (icmp_opt & IPPF_HOPOPTS) {
4003 			struct T_opthdr *toh;
4004 
4005 			toh = (struct T_opthdr *)dstopt;
4006 			toh->level = IPPROTO_IPV6;
4007 			toh->name = IPV6_HOPOPTS;
4008 			toh->len = sizeof (struct T_opthdr) +
4009 			    ipp.ipp_hopoptslen - hopstrip;
4010 			toh->status = 0;
4011 			dstopt += sizeof (struct T_opthdr);
4012 			bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt,
4013 			    ipp.ipp_hopoptslen - hopstrip);
4014 			if (hopstrip > 0) {
4015 				/* copy next header value and fake length */
4016 				dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0];
4017 				dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] -
4018 				    hopstrip / 8;
4019 			}
4020 			dstopt += ipp.ipp_hopoptslen - hopstrip;
4021 			udi_size -= toh->len;
4022 		}
4023 		if (icmp_opt & IPPF_RTDSTOPTS) {
4024 			struct T_opthdr *toh;
4025 
4026 			toh = (struct T_opthdr *)dstopt;
4027 			toh->level = IPPROTO_IPV6;
4028 			toh->name = IPV6_DSTOPTS;
4029 			toh->len = sizeof (struct T_opthdr) +
4030 			    ipp.ipp_rtdstoptslen;
4031 			toh->status = 0;
4032 			dstopt += sizeof (struct T_opthdr);
4033 			bcopy(ipp.ipp_rtdstopts, dstopt,
4034 			    ipp.ipp_rtdstoptslen);
4035 			dstopt += ipp.ipp_rtdstoptslen;
4036 			udi_size -= toh->len;
4037 		}
4038 		if (icmp_opt & IPPF_RTHDR) {
4039 			struct T_opthdr *toh;
4040 
4041 			toh = (struct T_opthdr *)dstopt;
4042 			toh->level = IPPROTO_IPV6;
4043 			toh->name = IPV6_RTHDR;
4044 			toh->len = sizeof (struct T_opthdr) +
4045 			    ipp.ipp_rthdrlen;
4046 			toh->status = 0;
4047 			dstopt += sizeof (struct T_opthdr);
4048 			bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen);
4049 			dstopt += ipp.ipp_rthdrlen;
4050 			udi_size -= toh->len;
4051 		}
4052 		if (icmp_opt & IPPF_DSTOPTS) {
4053 			struct T_opthdr *toh;
4054 
4055 			toh = (struct T_opthdr *)dstopt;
4056 			toh->level = IPPROTO_IPV6;
4057 			toh->name = IPV6_DSTOPTS;
4058 			toh->len = sizeof (struct T_opthdr) +
4059 			    ipp.ipp_dstoptslen;
4060 			toh->status = 0;
4061 			dstopt += sizeof (struct T_opthdr);
4062 			bcopy(ipp.ipp_dstopts, dstopt,
4063 			    ipp.ipp_dstoptslen);
4064 			dstopt += ipp.ipp_dstoptslen;
4065 			udi_size -= toh->len;
4066 		}
4067 		/* Consumed all of allocated space */
4068 		ASSERT(udi_size == 0);
4069 	}
4070 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
4071 
4072 deliver:
4073 	icmp_ulp_recv(connp, mp);
4074 
4075 }
4076 
4077 /*
4078  * return SNMP stuff in buffer in mpdata
4079  */
4080 mblk_t *
4081 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
4082 {
4083 	mblk_t			*mpdata;
4084 	struct opthdr		*optp;
4085 	conn_t			*connp = Q_TO_CONN(q);
4086 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
4087 	mblk_t			*mp2ctl;
4088 
4089 	/*
4090 	 * make a copy of the original message
4091 	 */
4092 	mp2ctl = copymsg(mpctl);
4093 
4094 	if (mpctl == NULL ||
4095 	    (mpdata = mpctl->b_cont) == NULL) {
4096 		freemsg(mpctl);
4097 		freemsg(mp2ctl);
4098 		return (0);
4099 	}
4100 
4101 	/* fixed length structure for IPv4 and IPv6 counters */
4102 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
4103 	optp->level = EXPER_RAWIP;
4104 	optp->name = 0;
4105 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
4106 	    sizeof (is->is_rawip_mib));
4107 	optp->len = msgdsize(mpdata);
4108 	qreply(q, mpctl);
4109 
4110 	return (mp2ctl);
4111 }
4112 
4113 /*
4114  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
4115  * TODO:  If this ever actually tries to set anything, it needs to be
4116  * to do the appropriate locking.
4117  */
4118 /* ARGSUSED */
4119 int
4120 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
4121     uchar_t *ptr, int len)
4122 {
4123 	switch (level) {
4124 	case EXPER_RAWIP:
4125 		return (0);
4126 	default:
4127 		return (1);
4128 	}
4129 }
4130 
4131 /*
4132  * This routine creates a T_UDERROR_IND message and passes it upstream.
4133  * The address and options are copied from the T_UNITDATA_REQ message
4134  * passed in mp.  This message is freed.
4135  */
4136 static void
4137 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
4138 {
4139 	mblk_t	*mp1;
4140 	uchar_t	*rptr = mp->b_rptr;
4141 	struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr;
4142 
4143 	mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset],
4144 	    tudr->DEST_length, (char *)&rptr[tudr->OPT_offset],
4145 	    tudr->OPT_length, err);
4146 	if (mp1)
4147 		qreply(q, mp1);
4148 	freemsg(mp);
4149 }
4150 
4151 
4152 static int
4153 rawip_do_unbind(conn_t *connp)
4154 {
4155 	icmp_t *icmp = connp->conn_icmp;
4156 
4157 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4158 	/* If a bind has not been done, we can't unbind. */
4159 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
4160 		rw_exit(&icmp->icmp_rwlock);
4161 		return (-TOUTSTATE);
4162 	}
4163 	icmp->icmp_pending_op = T_UNBIND_REQ;
4164 	rw_exit(&icmp->icmp_rwlock);
4165 
4166 	/*
4167 	 * Call ip to unbind
4168 	 */
4169 
4170 	ip_unbind(connp);
4171 
4172 	/*
4173 	 * Once we're unbound from IP, the pending operation may be cleared
4174 	 * here.
4175 	 */
4176 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4177 	V6_SET_ZERO(icmp->icmp_v6src);
4178 	V6_SET_ZERO(icmp->icmp_bound_v6src);
4179 	icmp->icmp_pending_op = -1;
4180 	icmp->icmp_state = TS_UNBND;
4181 	if (icmp->icmp_family == AF_INET6)
4182 		(void) icmp_build_hdrs(icmp);
4183 	rw_exit(&icmp->icmp_rwlock);
4184 	return (0);
4185 }
4186 
4187 /*
4188  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
4189  * After some error checking, the message is passed downstream to ip.
4190  */
4191 static void
4192 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
4193 {
4194 	conn_t	*connp = Q_TO_CONN(q);
4195 	int	error;
4196 
4197 	ASSERT(mp->b_cont == NULL);
4198 	error = rawip_do_unbind(connp);
4199 	if (error) {
4200 		if (error < 0) {
4201 			icmp_err_ack(q, mp, -error, 0);
4202 		} else {
4203 			icmp_err_ack(q, mp, 0, error);
4204 		}
4205 		return;
4206 	}
4207 
4208 	/*
4209 	 * Convert mp into a T_OK_ACK
4210 	 */
4211 
4212 	mp = mi_tpi_ok_ack_alloc(mp);
4213 
4214 	/*
4215 	 * should not happen in practice... T_OK_ACK is smaller than the
4216 	 * original message.
4217 	 */
4218 	ASSERT(mp != NULL);
4219 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
4220 	qreply(q, mp);
4221 }
4222 
4223 
4224 /*
4225  * Process IPv4 packets that already include an IP header.
4226  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
4227  * IPPROTO_IGMP).
4228  */
4229 static int
4230 icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp,
4231     ip4_pkt_t *pktinfop)
4232 {
4233 	icmp_stack_t *is = icmp->icmp_is;
4234 	ipha_t	*ipha;
4235 	int	ip_hdr_length;
4236 	int	tp_hdr_len;
4237 	mblk_t	*mp1;
4238 	uint_t	pkt_len;
4239 	ip_opt_info_t optinfo;
4240 
4241 	optinfo.ip_opt_flags = 0;
4242 	optinfo.ip_opt_ill_index = 0;
4243 	ipha = (ipha_t *)mp->b_rptr;
4244 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4245 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
4246 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4247 			ASSERT(icmp != NULL);
4248 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4249 			freemsg(mp);
4250 			return (0);
4251 		}
4252 		ipha = (ipha_t *)mp->b_rptr;
4253 	}
4254 	ipha->ipha_version_and_hdr_length =
4255 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
4256 
4257 	/*
4258 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4259 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4260 	 * tell IP that the application has sent a complete IP header and not
4261 	 * to compute the transport checksum nor change the DF flag.
4262 	 */
4263 	ipha->ipha_ident = IP_HDR_INCLUDED;
4264 	ipha->ipha_hdr_checksum = 0;
4265 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
4266 	/* Insert options if any */
4267 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4268 		/*
4269 		 * Put the IP header plus any transport header that is
4270 		 * checksumed by ip_wput into the first mblk. (ip_wput assumes
4271 		 * that at least the checksum field is in the first mblk.)
4272 		 */
4273 		switch (ipha->ipha_protocol) {
4274 		case IPPROTO_UDP:
4275 			tp_hdr_len = 8;
4276 			break;
4277 		case IPPROTO_TCP:
4278 			tp_hdr_len = 20;
4279 			break;
4280 		default:
4281 			tp_hdr_len = 0;
4282 			break;
4283 		}
4284 		/*
4285 		 * The code below assumes that IP_SIMPLE_HDR_LENGTH plus
4286 		 * tp_hdr_len bytes will be in a single mblk.
4287 		 */
4288 		if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH +
4289 		    tp_hdr_len)) {
4290 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH +
4291 			    tp_hdr_len)) {
4292 				BUMP_MIB(&is->is_rawip_mib,
4293 				    rawipOutErrors);
4294 				freemsg(mp);
4295 				return (0);
4296 			}
4297 			ipha = (ipha_t *)mp->b_rptr;
4298 		}
4299 
4300 		/*
4301 		 * if the length is larger then the max allowed IP packet,
4302 		 * then send an error and abort the processing.
4303 		 */
4304 		pkt_len = ntohs(ipha->ipha_length)
4305 		    + icmp->icmp_ip_snd_options_len;
4306 		if (pkt_len > IP_MAXPACKET) {
4307 			return (EMSGSIZE);
4308 		}
4309 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra +
4310 		    tp_hdr_len, BPRI_LO))) {
4311 			return (ENOMEM);
4312 		}
4313 		mp1->b_rptr += is->is_wroff_extra;
4314 		mp1->b_wptr = mp1->b_rptr + ip_hdr_length;
4315 
4316 		ipha->ipha_length = htons((uint16_t)pkt_len);
4317 		bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH);
4318 
4319 		/* Copy transport header if any */
4320 		bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len);
4321 		mp1->b_wptr += tp_hdr_len;
4322 
4323 		/* Add options */
4324 		ipha = (ipha_t *)mp1->b_rptr;
4325 		bcopy(icmp->icmp_ip_snd_options, &ipha[1],
4326 		    icmp->icmp_ip_snd_options_len);
4327 
4328 		/* Drop IP header and transport header from original */
4329 		(void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len);
4330 
4331 		mp1->b_cont = mp;
4332 		mp = mp1;
4333 		/*
4334 		 * Massage source route putting first source
4335 		 * route in ipha_dst.
4336 		 */
4337 		(void) ip_massage_options(ipha, is->is_netstack);
4338 	}
4339 
4340 	if (pktinfop != NULL) {
4341 		/*
4342 		 * Over write the source address provided in the header
4343 		 */
4344 		if (pktinfop->ip4_addr != INADDR_ANY) {
4345 			ipha->ipha_src = pktinfop->ip4_addr;
4346 			optinfo.ip_opt_flags = IP_VERIFY_SRC;
4347 		}
4348 
4349 		if (pktinfop->ip4_ill_index != 0) {
4350 			optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4351 		}
4352 	}
4353 
4354 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4355 	return (0);
4356 }
4357 
4358 static int
4359 icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
4360 {
4361 	int err;
4362 	uchar_t opt_storage[IP_MAX_OPT_LENGTH];
4363 	icmp_stack_t		*is = icmp->icmp_is;
4364 	conn_t			*connp = icmp->icmp_connp;
4365 	cred_t			*cr;
4366 
4367 	/*
4368 	 * All Solaris components should pass a db_credp
4369 	 * for this message, hence we ASSERT.
4370 	 * On production kernels we return an error to be robust against
4371 	 * random streams modules sitting on top of us.
4372 	 */
4373 	cr = msg_getcred(mp, NULL);
4374 	ASSERT(cr != NULL);
4375 	if (cr == NULL)
4376 		return (EINVAL);
4377 
4378 	err = tsol_compute_label(cr, dst,
4379 	    opt_storage, connp->conn_mac_exempt,
4380 	    is->is_netstack->netstack_ip);
4381 	if (err == 0) {
4382 		err = tsol_update_options(&icmp->icmp_ip_snd_options,
4383 		    &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len,
4384 		    opt_storage);
4385 	}
4386 	if (err != 0) {
4387 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4388 		DTRACE_PROBE4(
4389 		    tx__ip__log__drop__updatelabel__icmp,
4390 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4391 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4392 		return (err);
4393 	}
4394 	IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst);
4395 	return (0);
4396 }
4397 
4398 /*
4399  * This routine handles all messages passed downstream.  It either
4400  * consumes the message or passes it downstream; it never queues a
4401  * a message.
4402  */
4403 static void
4404 icmp_wput(queue_t *q, mblk_t *mp)
4405 {
4406 	uchar_t	*rptr = mp->b_rptr;
4407 	ipha_t	*ipha;
4408 	mblk_t	*mp1;
4409 #define	tudr ((struct T_unitdata_req *)rptr)
4410 	size_t	ip_len;
4411 	conn_t	*connp = Q_TO_CONN(q);
4412 	icmp_t	*icmp = connp->conn_icmp;
4413 	icmp_stack_t *is = icmp->icmp_is;
4414 	sin6_t	*sin6;
4415 	sin_t	*sin;
4416 	ipaddr_t	v4dst;
4417 	ip4_pkt_t	pktinfo;
4418 	ip4_pkt_t	*pktinfop = &pktinfo;
4419 	ip6_pkt_t	ipp_s;  /* For ancillary data options */
4420 	ip6_pkt_t	*ipp = &ipp_s;
4421 	int error;
4422 
4423 	ipp->ipp_fields = 0;
4424 	ipp->ipp_sticky_ignored = 0;
4425 
4426 	switch (mp->b_datap->db_type) {
4427 	case M_DATA:
4428 		if (icmp->icmp_hdrincl) {
4429 			ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
4430 			ipha = (ipha_t *)mp->b_rptr;
4431 			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
4432 				if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4433 					BUMP_MIB(&is->is_rawip_mib,
4434 					    rawipOutErrors);
4435 					freemsg(mp);
4436 					return;
4437 				}
4438 				ipha = (ipha_t *)mp->b_rptr;
4439 			}
4440 			/*
4441 			 * If this connection was used for v6 (inconceivable!)
4442 			 * or if we have a new destination, then it's time to
4443 			 * figure a new label.
4444 			 */
4445 			if (is_system_labeled() &&
4446 			    (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4447 			    V4_PART_OF_V6(icmp->icmp_v6lastdst) !=
4448 			    ipha->ipha_dst)) {
4449 				error = icmp_update_label(icmp, mp,
4450 				    ipha->ipha_dst);
4451 				if (error != 0) {
4452 					icmp_ud_err(q, mp, error);
4453 					return;
4454 				}
4455 			}
4456 			error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL);
4457 			if (error != 0)
4458 				icmp_ud_err(q, mp, error);
4459 			return;
4460 		}
4461 		freemsg(mp);
4462 		return;
4463 	case M_PROTO:
4464 	case M_PCPROTO:
4465 		ip_len = mp->b_wptr - rptr;
4466 		if (ip_len >= sizeof (struct T_unitdata_req)) {
4467 			/* Expedite valid T_UNITDATA_REQ to below the switch */
4468 			if (((union T_primitives *)rptr)->type
4469 			    == T_UNITDATA_REQ)
4470 				break;
4471 		}
4472 		/* FALLTHRU */
4473 	default:
4474 		icmp_wput_other(q, mp);
4475 		return;
4476 	}
4477 
4478 	/* Handle T_UNITDATA_REQ messages here. */
4479 
4480 	mp1 = mp->b_cont;
4481 	if (mp1 == NULL) {
4482 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4483 		icmp_ud_err(q, mp, EPROTO);
4484 		return;
4485 	}
4486 
4487 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
4488 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4489 		icmp_ud_err(q, mp, EADDRNOTAVAIL);
4490 		return;
4491 	}
4492 
4493 	switch (icmp->icmp_family) {
4494 	case AF_INET6:
4495 		sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
4496 		if (!OK_32PTR((char *)sin6) ||
4497 		    tudr->DEST_length != sizeof (sin6_t) ||
4498 		    sin6->sin6_family != AF_INET6) {
4499 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4500 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4501 			return;
4502 		}
4503 
4504 		/* No support for mapped addresses on raw sockets */
4505 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4506 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4507 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4508 			return;
4509 		}
4510 
4511 		/*
4512 		 * Destination is a native IPv6 address.
4513 		 * Send out an IPv6 format packet.
4514 		 */
4515 		if (tudr->OPT_length != 0) {
4516 			int error;
4517 
4518 			error = 0;
4519 			if (icmp_unitdata_opt_process(q, mp, &error,
4520 			    (void *)ipp) < 0) {
4521 				/* failure */
4522 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4523 				icmp_ud_err(q, mp, error);
4524 				return;
4525 			}
4526 			ASSERT(error == 0);
4527 		}
4528 
4529 		error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp);
4530 		goto done;
4531 
4532 	case AF_INET:
4533 		sin = (sin_t *)&rptr[tudr->DEST_offset];
4534 		if (!OK_32PTR((char *)sin) ||
4535 		    tudr->DEST_length != sizeof (sin_t) ||
4536 		    sin->sin_family != AF_INET) {
4537 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4538 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4539 			return;
4540 		}
4541 		/* Extract and ipaddr */
4542 		v4dst = sin->sin_addr.s_addr;
4543 		break;
4544 
4545 	default:
4546 		ASSERT(0);
4547 	}
4548 
4549 	pktinfop->ip4_ill_index = 0;
4550 	pktinfop->ip4_addr = INADDR_ANY;
4551 
4552 	/*
4553 	 * If options passed in, feed it for verification and handling
4554 	 */
4555 	if (tudr->OPT_length != 0) {
4556 		int error;
4557 
4558 		error = 0;
4559 		if (icmp_unitdata_opt_process(q, mp, &error,
4560 		    (void *)pktinfop) < 0) {
4561 			/* failure */
4562 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4563 			icmp_ud_err(q, mp, error);
4564 			return;
4565 		}
4566 		ASSERT(error == 0);
4567 		/*
4568 		 * Note: Success in processing options.
4569 		 * mp option buffer represented by
4570 		 * OPT_length/offset now potentially modified
4571 		 * and contain option setting results
4572 		 */
4573 	}
4574 
4575 	error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop);
4576 done:
4577 	if (error != 0) {
4578 		icmp_ud_err(q, mp, error);
4579 		return;
4580 	} else {
4581 		mp->b_cont = NULL;
4582 		freeb(mp);
4583 	}
4584 }
4585 
4586 
4587 /* ARGSUSED */
4588 static void
4589 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4590 {
4591 #ifdef DEBUG
4592 	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4593 #endif
4594 	freemsg(mp);
4595 }
4596 
4597 static int
4598 raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst,
4599     ip4_pkt_t *pktinfop)
4600 {
4601 	ipha_t	*ipha;
4602 	size_t	ip_len;
4603 	icmp_t	*icmp = connp->conn_icmp;
4604 	icmp_stack_t *is = icmp->icmp_is;
4605 	int	ip_hdr_length;
4606 	ip_opt_info_t	optinfo;
4607 
4608 	optinfo.ip_opt_flags = 0;
4609 	optinfo.ip_opt_ill_index = 0;
4610 
4611 	if (icmp->icmp_state == TS_UNBND) {
4612 		/* If a port has not been bound to the stream, fail. */
4613 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4614 		return (EPROTO);
4615 	}
4616 
4617 	if (v4dst == INADDR_ANY)
4618 		v4dst = htonl(INADDR_LOOPBACK);
4619 
4620 	/* Check if our saved options are valid; update if not */
4621 	if (is_system_labeled() &&
4622 	    (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4623 	    V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst)) {
4624 		int error = icmp_update_label(icmp, mp, v4dst);
4625 
4626 		if (error != 0)
4627 			return (error);
4628 	}
4629 
4630 	/* Protocol 255 contains full IP headers */
4631 	if (icmp->icmp_hdrincl)
4632 		return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop));
4633 
4634 	/* Add an IP header */
4635 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4636 	ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length];
4637 	if ((uchar_t *)ipha < mp->b_datap->db_base ||
4638 	    mp->b_datap->db_ref != 1 ||
4639 	    !OK_32PTR(ipha)) {
4640 		mblk_t	*mp1;
4641 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra,
4642 		    BPRI_LO))) {
4643 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4644 			return (ENOMEM);
4645 		}
4646 		mp1->b_cont = mp;
4647 		ipha = (ipha_t *)mp1->b_datap->db_lim;
4648 		mp1->b_wptr = (uchar_t *)ipha;
4649 		ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length);
4650 		mp = mp1;
4651 	}
4652 #ifdef	_BIG_ENDIAN
4653 	/* Set version, header length, and tos */
4654 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4655 	    ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) |
4656 	    icmp->icmp_type_of_service);
4657 	/* Set ttl and protocol */
4658 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto;
4659 #else
4660 	/* Set version, header length, and tos */
4661 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4662 	    ((icmp->icmp_type_of_service << 8) |
4663 	    ((IP_VERSION << 4) | (ip_hdr_length>>2)));
4664 	/* Set ttl and protocol */
4665 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl;
4666 #endif
4667 	if (pktinfop->ip4_addr != INADDR_ANY) {
4668 		ipha->ipha_src = pktinfop->ip4_addr;
4669 		optinfo.ip_opt_flags = IP_VERIFY_SRC;
4670 	} else {
4671 
4672 		/*
4673 		 * Copy our address into the packet.  If this is zero,
4674 		 * ip will fill in the real source address.
4675 		 */
4676 		IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src);
4677 	}
4678 
4679 	ipha->ipha_fragment_offset_and_flags = 0;
4680 
4681 	if (pktinfop->ip4_ill_index != 0) {
4682 		optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4683 	}
4684 
4685 
4686 	/*
4687 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4688 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4689 	 * tell IP that the application has sent a complete IP header and not
4690 	 * to compute the transport checksum nor change the DF flag.
4691 	 */
4692 	ipha->ipha_ident = IP_HDR_INCLUDED;
4693 
4694 	/* Finish common formatting of the packet. */
4695 	mp->b_rptr = (uchar_t *)ipha;
4696 
4697 	ip_len = mp->b_wptr - (uchar_t *)ipha;
4698 	if (mp->b_cont != NULL)
4699 		ip_len += msgdsize(mp->b_cont);
4700 
4701 	/*
4702 	 * Set the length into the IP header.
4703 	 * If the length is greater than the maximum allowed by IP,
4704 	 * then free the message and return. Do not try and send it
4705 	 * as this can cause problems in layers below.
4706 	 */
4707 	if (ip_len > IP_MAXPACKET) {
4708 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4709 		return (EMSGSIZE);
4710 	}
4711 	ipha->ipha_length = htons((uint16_t)ip_len);
4712 	/*
4713 	 * Copy in the destination address request
4714 	 */
4715 	ipha->ipha_dst = v4dst;
4716 
4717 	/*
4718 	 * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic.
4719 	 */
4720 	if (CLASSD(v4dst))
4721 		ipha->ipha_ttl = icmp->icmp_multicast_ttl;
4722 
4723 	/* Copy in options if any */
4724 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4725 		bcopy(icmp->icmp_ip_snd_options,
4726 		    &ipha[1], icmp->icmp_ip_snd_options_len);
4727 		/*
4728 		 * Massage source route putting first source route in ipha_dst.
4729 		 * Ignore the destination in the T_unitdata_req.
4730 		 */
4731 		(void) ip_massage_options(ipha, is->is_netstack);
4732 	}
4733 
4734 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4735 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4736 	return (0);
4737 }
4738 
4739 static int
4740 icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
4741 {
4742 	int err;
4743 	uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
4744 	icmp_stack_t		*is = icmp->icmp_is;
4745 	conn_t			*connp = icmp->icmp_connp;
4746 	cred_t			*cr;
4747 
4748 	/*
4749 	 * All Solaris components should pass a db_credp
4750 	 * for this message, hence we ASSERT.
4751 	 * On production kernels we return an error to be robust against
4752 	 * random streams modules sitting on top of us.
4753 	 */
4754 	cr = msg_getcred(mp, NULL);
4755 	ASSERT(cr != NULL);
4756 	if (cr == NULL)
4757 		return (EINVAL);
4758 
4759 	err = tsol_compute_label_v6(cr, dst,
4760 	    opt_storage, connp->conn_mac_exempt,
4761 	    is->is_netstack->netstack_ip);
4762 	if (err == 0) {
4763 		err = tsol_update_sticky(&icmp->icmp_sticky_ipp,
4764 		    &icmp->icmp_label_len_v6, opt_storage);
4765 	}
4766 	if (err != 0) {
4767 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4768 		DTRACE_PROBE4(
4769 		    tx__ip__log__drop__updatelabel__icmp6,
4770 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4771 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4772 		return (err);
4773 	}
4774 
4775 	icmp->icmp_v6lastdst = *dst;
4776 	return (0);
4777 }
4778 
4779 /*
4780  * raw_ip_send_data_v6():
4781  * Assumes that icmp_wput did some sanity checking on the destination
4782  * address, but that the label may not yet be correct.
4783  */
4784 static int
4785 raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6,
4786     ip6_pkt_t *ipp)
4787 {
4788 	ip6_t			*ip6h;
4789 	ip6i_t			*ip6i;	/* mp->b_rptr even if no ip6i_t */
4790 	int			ip_hdr_len = IPV6_HDR_LEN;
4791 	size_t			ip_len;
4792 	icmp_t			*icmp = connp->conn_icmp;
4793 	icmp_stack_t		*is = icmp->icmp_is;
4794 	ip6_pkt_t		*tipp;
4795 	uint32_t		csum = 0;
4796 	uint_t			ignore = 0;
4797 	uint_t			option_exists = 0, is_sticky = 0;
4798 	uint8_t			*cp;
4799 	uint8_t			*nxthdr_ptr;
4800 	in6_addr_t		ip6_dst;
4801 
4802 	/*
4803 	 * If the local address is a mapped address return
4804 	 * an error.
4805 	 * It would be possible to send an IPv6 packet but the
4806 	 * response would never make it back to the application
4807 	 * since it is bound to a mapped address.
4808 	 */
4809 	if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) {
4810 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4811 		return (EADDRNOTAVAIL);
4812 	}
4813 
4814 	ignore = ipp->ipp_sticky_ignored;
4815 	if (sin6->sin6_scope_id != 0 &&
4816 	    IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
4817 		/*
4818 		 * IPPF_SCOPE_ID is special.  It's neither a sticky
4819 		 * option nor ancillary data.  It needs to be
4820 		 * explicitly set in options_exists.
4821 		 */
4822 		option_exists |= IPPF_SCOPE_ID;
4823 	}
4824 
4825 	/*
4826 	 * Compute the destination address
4827 	 */
4828 	ip6_dst = sin6->sin6_addr;
4829 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4830 		ip6_dst = ipv6_loopback;
4831 
4832 	/*
4833 	 * If we're not going to the same destination as last time, then
4834 	 * recompute the label required.  This is done in a separate routine to
4835 	 * avoid blowing up our stack here.
4836 	 */
4837 	if (is_system_labeled() &&
4838 	    !IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst)) {
4839 		int error = 0;
4840 
4841 		error = icmp_update_label_v6(icmp, mp, &ip6_dst);
4842 		if (error != 0)
4843 			return (error);
4844 	}
4845 
4846 	/*
4847 	 * If there's a security label here, then we ignore any options the
4848 	 * user may try to set.  We keep the peer's label as a hidden sticky
4849 	 * option.
4850 	 */
4851 	if (icmp->icmp_label_len_v6 > 0) {
4852 		ignore &= ~IPPF_HOPOPTS;
4853 		ipp->ipp_fields &= ~IPPF_HOPOPTS;
4854 	}
4855 
4856 	if ((icmp->icmp_sticky_ipp.ipp_fields == 0) &&
4857 	    (ipp->ipp_fields == 0)) {
4858 		/* No sticky options nor ancillary data. */
4859 		goto no_options;
4860 	}
4861 
4862 	/*
4863 	 * Go through the options figuring out where each is going to
4864 	 * come from and build two masks.  The first mask indicates if
4865 	 * the option exists at all.  The second mask indicates if the
4866 	 * option is sticky or ancillary.
4867 	 */
4868 	if (!(ignore & IPPF_HOPOPTS)) {
4869 		if (ipp->ipp_fields & IPPF_HOPOPTS) {
4870 			option_exists |= IPPF_HOPOPTS;
4871 			ip_hdr_len += ipp->ipp_hopoptslen;
4872 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
4873 			option_exists |= IPPF_HOPOPTS;
4874 			is_sticky |= IPPF_HOPOPTS;
4875 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_hopoptslen;
4876 		}
4877 	}
4878 
4879 	if (!(ignore & IPPF_RTHDR)) {
4880 		if (ipp->ipp_fields & IPPF_RTHDR) {
4881 			option_exists |= IPPF_RTHDR;
4882 			ip_hdr_len += ipp->ipp_rthdrlen;
4883 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
4884 			option_exists |= IPPF_RTHDR;
4885 			is_sticky |= IPPF_RTHDR;
4886 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen;
4887 		}
4888 	}
4889 
4890 	if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) {
4891 		/*
4892 		 * Need to have a router header to use these.
4893 		 */
4894 		if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
4895 			option_exists |= IPPF_RTDSTOPTS;
4896 			ip_hdr_len += ipp->ipp_rtdstoptslen;
4897 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
4898 			option_exists |= IPPF_RTDSTOPTS;
4899 			is_sticky |= IPPF_RTDSTOPTS;
4900 			ip_hdr_len +=
4901 			    icmp->icmp_sticky_ipp.ipp_rtdstoptslen;
4902 		}
4903 	}
4904 
4905 	if (!(ignore & IPPF_DSTOPTS)) {
4906 		if (ipp->ipp_fields & IPPF_DSTOPTS) {
4907 			option_exists |= IPPF_DSTOPTS;
4908 			ip_hdr_len += ipp->ipp_dstoptslen;
4909 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
4910 			option_exists |= IPPF_DSTOPTS;
4911 			is_sticky |= IPPF_DSTOPTS;
4912 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen;
4913 		}
4914 	}
4915 
4916 	if (!(ignore & IPPF_IFINDEX)) {
4917 		if (ipp->ipp_fields & IPPF_IFINDEX) {
4918 			option_exists |= IPPF_IFINDEX;
4919 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) {
4920 			option_exists |= IPPF_IFINDEX;
4921 			is_sticky |= IPPF_IFINDEX;
4922 		}
4923 	}
4924 
4925 	if (!(ignore & IPPF_ADDR)) {
4926 		if (ipp->ipp_fields & IPPF_ADDR) {
4927 			option_exists |= IPPF_ADDR;
4928 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) {
4929 			option_exists |= IPPF_ADDR;
4930 			is_sticky |= IPPF_ADDR;
4931 		}
4932 	}
4933 
4934 	if (!(ignore & IPPF_DONTFRAG)) {
4935 		if (ipp->ipp_fields & IPPF_DONTFRAG) {
4936 			option_exists |= IPPF_DONTFRAG;
4937 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) {
4938 			option_exists |= IPPF_DONTFRAG;
4939 			is_sticky |= IPPF_DONTFRAG;
4940 		}
4941 	}
4942 
4943 	if (!(ignore & IPPF_USE_MIN_MTU)) {
4944 		if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
4945 			option_exists |= IPPF_USE_MIN_MTU;
4946 		} else if (icmp->icmp_sticky_ipp.ipp_fields &
4947 		    IPPF_USE_MIN_MTU) {
4948 			option_exists |= IPPF_USE_MIN_MTU;
4949 			is_sticky |= IPPF_USE_MIN_MTU;
4950 		}
4951 	}
4952 
4953 	if (!(ignore & IPPF_NEXTHOP)) {
4954 		if (ipp->ipp_fields & IPPF_NEXTHOP) {
4955 			option_exists |= IPPF_NEXTHOP;
4956 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) {
4957 			option_exists |= IPPF_NEXTHOP;
4958 			is_sticky |= IPPF_NEXTHOP;
4959 		}
4960 	}
4961 
4962 	if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT))
4963 		option_exists |= IPPF_HOPLIMIT;
4964 	/* IPV6_HOPLIMIT can never be sticky */
4965 	ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT));
4966 
4967 	if (!(ignore & IPPF_UNICAST_HOPS) &&
4968 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) {
4969 		option_exists |= IPPF_UNICAST_HOPS;
4970 		is_sticky |= IPPF_UNICAST_HOPS;
4971 	}
4972 
4973 	if (!(ignore & IPPF_MULTICAST_HOPS) &&
4974 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) {
4975 		option_exists |= IPPF_MULTICAST_HOPS;
4976 		is_sticky |= IPPF_MULTICAST_HOPS;
4977 	}
4978 
4979 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) {
4980 		/* This is a sticky socket option only */
4981 		option_exists |= IPPF_NO_CKSUM;
4982 		is_sticky |= IPPF_NO_CKSUM;
4983 	}
4984 
4985 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) {
4986 		/* This is a sticky socket option only */
4987 		option_exists |= IPPF_RAW_CKSUM;
4988 		is_sticky |= IPPF_RAW_CKSUM;
4989 	}
4990 
4991 	if (!(ignore & IPPF_TCLASS)) {
4992 		if (ipp->ipp_fields & IPPF_TCLASS) {
4993 			option_exists |= IPPF_TCLASS;
4994 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) {
4995 			option_exists |= IPPF_TCLASS;
4996 			is_sticky |= IPPF_TCLASS;
4997 		}
4998 	}
4999 
5000 no_options:
5001 
5002 	/*
5003 	 * If any options carried in the ip6i_t were specified, we
5004 	 * need to account for the ip6i_t in the data we'll be sending
5005 	 * down.
5006 	 */
5007 	if (option_exists & IPPF_HAS_IP6I)
5008 		ip_hdr_len += sizeof (ip6i_t);
5009 
5010 	/* check/fix buffer config, setup pointers into it */
5011 	ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len];
5012 	if ((mp->b_datap->db_ref != 1) ||
5013 	    ((unsigned char *)ip6h < mp->b_datap->db_base) ||
5014 	    !OK_32PTR(ip6h)) {
5015 		mblk_t	*mp1;
5016 
5017 		/* Try to get everything in a single mblk next time */
5018 		if (ip_hdr_len > icmp->icmp_max_hdr_len) {
5019 			icmp->icmp_max_hdr_len = ip_hdr_len;
5020 
5021 			(void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp,
5022 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
5023 		}
5024 		mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO);
5025 		if (!mp1) {
5026 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5027 			return (ENOMEM);
5028 		}
5029 		mp1->b_cont = mp;
5030 		mp1->b_wptr = mp1->b_datap->db_lim;
5031 		ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len);
5032 		mp = mp1;
5033 	}
5034 	mp->b_rptr = (unsigned char *)ip6h;
5035 	ip6i = (ip6i_t *)ip6h;
5036 
5037 #define	ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp)
5038 	if (option_exists & IPPF_HAS_IP6I) {
5039 		ip6h = (ip6_t *)&ip6i[1];
5040 		ip6i->ip6i_flags = 0;
5041 		ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5042 
5043 		/* sin6_scope_id takes precedence over IPPF_IFINDEX */
5044 		if (option_exists & IPPF_SCOPE_ID) {
5045 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5046 			ip6i->ip6i_ifindex = sin6->sin6_scope_id;
5047 		} else if (option_exists & IPPF_IFINDEX) {
5048 			tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX);
5049 			ASSERT(tipp->ipp_ifindex != 0);
5050 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5051 			ip6i->ip6i_ifindex = tipp->ipp_ifindex;
5052 		}
5053 
5054 		if (option_exists & IPPF_RAW_CKSUM) {
5055 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
5056 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
5057 		}
5058 
5059 		if (option_exists & IPPF_NO_CKSUM) {
5060 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
5061 		}
5062 
5063 		if (option_exists & IPPF_ADDR) {
5064 			/*
5065 			 * Enable per-packet source address verification if
5066 			 * IPV6_PKTINFO specified the source address.
5067 			 * ip6_src is set in the transport's _wput function.
5068 			 */
5069 			ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
5070 		}
5071 
5072 		if (option_exists & IPPF_DONTFRAG) {
5073 			ip6i->ip6i_flags |= IP6I_DONTFRAG;
5074 		}
5075 
5076 		if (option_exists & IPPF_USE_MIN_MTU) {
5077 			ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU(
5078 			    ip6i->ip6i_flags, ipp->ipp_use_min_mtu);
5079 		}
5080 
5081 		if (option_exists & IPPF_NEXTHOP) {
5082 			tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP);
5083 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop));
5084 			ip6i->ip6i_flags |= IP6I_NEXTHOP;
5085 			ip6i->ip6i_nexthop = tipp->ipp_nexthop;
5086 		}
5087 
5088 		/*
5089 		 * tell IP this is an ip6i_t private header
5090 		 */
5091 		ip6i->ip6i_nxt = IPPROTO_RAW;
5092 	}
5093 
5094 	/* Initialize IPv6 header */
5095 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5096 	bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src));
5097 
5098 	/* Set the hoplimit of the outgoing packet. */
5099 	if (option_exists & IPPF_HOPLIMIT) {
5100 		/* IPV6_HOPLIMIT ancillary data overrides all other settings. */
5101 		ip6h->ip6_hops = ipp->ipp_hoplimit;
5102 		ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5103 	} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
5104 		ip6h->ip6_hops = icmp->icmp_multicast_ttl;
5105 		if (option_exists & IPPF_MULTICAST_HOPS)
5106 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5107 	} else {
5108 		ip6h->ip6_hops = icmp->icmp_ttl;
5109 		if (option_exists & IPPF_UNICAST_HOPS)
5110 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5111 	}
5112 
5113 	if (option_exists & IPPF_ADDR) {
5114 		tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR);
5115 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr));
5116 		ip6h->ip6_src = tipp->ipp_addr;
5117 	} else {
5118 		/*
5119 		 * The source address was not set using IPV6_PKTINFO.
5120 		 * First look at the bound source.
5121 		 * If unspecified fallback to __sin6_src_id.
5122 		 */
5123 		ip6h->ip6_src = icmp->icmp_v6src;
5124 		if (sin6->__sin6_src_id != 0 &&
5125 		    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
5126 			ip_srcid_find_id(sin6->__sin6_src_id,
5127 			    &ip6h->ip6_src, icmp->icmp_zoneid,
5128 			    is->is_netstack);
5129 		}
5130 	}
5131 
5132 	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
5133 	cp = (uint8_t *)&ip6h[1];
5134 
5135 	/*
5136 	 * Here's where we have to start stringing together
5137 	 * any extension headers in the right order:
5138 	 * Hop-by-hop, destination, routing, and final destination opts.
5139 	 */
5140 	if (option_exists & IPPF_HOPOPTS) {
5141 		/* Hop-by-hop options */
5142 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
5143 		tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS);
5144 
5145 		*nxthdr_ptr = IPPROTO_HOPOPTS;
5146 		nxthdr_ptr = &hbh->ip6h_nxt;
5147 
5148 		bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen);
5149 		cp += tipp->ipp_hopoptslen;
5150 	}
5151 	/*
5152 	 * En-route destination options
5153 	 * Only do them if there's a routing header as well
5154 	 */
5155 	if (option_exists & IPPF_RTDSTOPTS) {
5156 		ip6_dest_t *dst = (ip6_dest_t *)cp;
5157 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS);
5158 
5159 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5160 		nxthdr_ptr = &dst->ip6d_nxt;
5161 
5162 		bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen);
5163 		cp += tipp->ipp_rtdstoptslen;
5164 	}
5165 	/*
5166 	 * Routing header next
5167 	 */
5168 	if (option_exists & IPPF_RTHDR) {
5169 		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
5170 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR);
5171 
5172 		*nxthdr_ptr = IPPROTO_ROUTING;
5173 		nxthdr_ptr = &rt->ip6r_nxt;
5174 
5175 		bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen);
5176 		cp += tipp->ipp_rthdrlen;
5177 	}
5178 	/*
5179 	 * Do ultimate destination options
5180 	 */
5181 	if (option_exists & IPPF_DSTOPTS) {
5182 		ip6_dest_t *dest = (ip6_dest_t *)cp;
5183 		tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS);
5184 
5185 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5186 		nxthdr_ptr = &dest->ip6d_nxt;
5187 
5188 		bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen);
5189 		cp += tipp->ipp_dstoptslen;
5190 	}
5191 
5192 	/*
5193 	 * Now set the last header pointer to the proto passed in
5194 	 */
5195 	ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len);
5196 	*nxthdr_ptr = icmp->icmp_proto;
5197 
5198 	/*
5199 	 * Copy in the destination address
5200 	 */
5201 	ip6h->ip6_dst = ip6_dst;
5202 
5203 	ip6h->ip6_vcf =
5204 	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
5205 	    (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
5206 
5207 	if (option_exists & IPPF_TCLASS) {
5208 		tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS);
5209 		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
5210 		    tipp->ipp_tclass);
5211 	}
5212 	if (option_exists & IPPF_RTHDR) {
5213 		ip6_rthdr_t	*rth;
5214 
5215 		/*
5216 		 * Perform any processing needed for source routing.
5217 		 * We know that all extension headers will be in the same mblk
5218 		 * as the IPv6 header.
5219 		 */
5220 		rth = ip_find_rthdr_v6(ip6h, mp->b_wptr);
5221 		if (rth != NULL && rth->ip6r_segleft != 0) {
5222 			if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
5223 				/*
5224 				 * Drop packet - only support Type 0 routing.
5225 				 * Notify the application as well.
5226 				 */
5227 				BUMP_MIB(&is->is_rawip_mib,
5228 				    rawipOutErrors);
5229 				return (EPROTO);
5230 			}
5231 			/*
5232 			 * rth->ip6r_len is twice the number of
5233 			 * addresses in the header
5234 			 */
5235 			if (rth->ip6r_len & 0x1) {
5236 				BUMP_MIB(&is->is_rawip_mib,
5237 				    rawipOutErrors);
5238 				return (EPROTO);
5239 			}
5240 			/*
5241 			 * Shuffle the routing header and ip6_dst
5242 			 * addresses, and get the checksum difference
5243 			 * between the first hop (in ip6_dst) and
5244 			 * the destination (in the last routing hdr entry).
5245 			 */
5246 			csum = ip_massage_options_v6(ip6h, rth,
5247 			    is->is_netstack);
5248 			/*
5249 			 * Verify that the first hop isn't a mapped address.
5250 			 * Routers along the path need to do this verification
5251 			 * for subsequent hops.
5252 			 */
5253 			if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
5254 				BUMP_MIB(&is->is_rawip_mib,
5255 				    rawipOutErrors);
5256 				return (EADDRNOTAVAIL);
5257 			}
5258 		}
5259 	}
5260 
5261 	ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
5262 	if (mp->b_cont != NULL)
5263 		ip_len += msgdsize(mp->b_cont);
5264 
5265 	/*
5266 	 * Set the length into the IP header.
5267 	 * If the length is greater than the maximum allowed by IP,
5268 	 * then free the message and return. Do not try and send it
5269 	 * as this can cause problems in layers below.
5270 	 */
5271 	if (ip_len > IP_MAXPACKET) {
5272 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5273 		return (EMSGSIZE);
5274 	}
5275 	if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) {
5276 		uint_t	cksum_off;	/* From ip6i == mp->b_rptr */
5277 		uint16_t *cksum_ptr;
5278 		uint_t	ext_hdrs_len;
5279 
5280 		/* ICMPv6 must have an offset matching icmp6_cksum offset */
5281 		ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 ||
5282 		    icmp->icmp_checksum_off == 2);
5283 
5284 		/*
5285 		 * We make it easy for IP to include our pseudo header
5286 		 * by putting our length in uh_checksum, modified (if
5287 		 * we have a routing header) by the checksum difference
5288 		 * between the ultimate destination and first hop addresses.
5289 		 * Note: ICMPv6 must always checksum the packet.
5290 		 */
5291 		cksum_off = ip_hdr_len + icmp->icmp_checksum_off;
5292 		if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) {
5293 			if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) {
5294 				BUMP_MIB(&is->is_rawip_mib,
5295 				    rawipOutErrors);
5296 				freemsg(mp);
5297 				return (0);
5298 			}
5299 			ip6i = (ip6i_t *)mp->b_rptr;
5300 			if (ip6i->ip6i_nxt == IPPROTO_RAW)
5301 				ip6h = (ip6_t *)&ip6i[1];
5302 			else
5303 				ip6h = (ip6_t *)ip6i;
5304 		}
5305 		/* Add payload length to checksum */
5306 		ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN -
5307 		    (int)((uchar_t *)ip6h - (uchar_t *)ip6i);
5308 		csum += htons(ip_len - ext_hdrs_len);
5309 
5310 		cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off);
5311 		csum = (csum & 0xFFFF) + (csum >> 16);
5312 		*cksum_ptr = (uint16_t)csum;
5313 	}
5314 
5315 #ifdef _LITTLE_ENDIAN
5316 	ip_len = htons(ip_len);
5317 #endif
5318 	ip6h->ip6_plen = (uint16_t)ip_len;
5319 
5320 	/* We're done. Pass the packet to IP */
5321 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
5322 	ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT);
5323 	return (0);
5324 }
5325 
5326 static void
5327 icmp_wput_other(queue_t *q, mblk_t *mp)
5328 {
5329 	uchar_t	*rptr = mp->b_rptr;
5330 	struct iocblk *iocp;
5331 #define	tudr ((struct T_unitdata_req *)rptr)
5332 	conn_t	*connp = Q_TO_CONN(q);
5333 	icmp_t	*icmp = connp->conn_icmp;
5334 	icmp_stack_t *is = icmp->icmp_is;
5335 	cred_t *cr;
5336 
5337 	switch (mp->b_datap->db_type) {
5338 	case M_PROTO:
5339 	case M_PCPROTO:
5340 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
5341 			/*
5342 			 * If the message does not contain a PRIM_type,
5343 			 * throw it away.
5344 			 */
5345 			freemsg(mp);
5346 			return;
5347 		}
5348 		switch (((union T_primitives *)rptr)->type) {
5349 		case T_ADDR_REQ:
5350 			icmp_addr_req(q, mp);
5351 			return;
5352 		case O_T_BIND_REQ:
5353 		case T_BIND_REQ:
5354 			icmp_tpi_bind(q, mp);
5355 			return;
5356 		case T_CONN_REQ:
5357 			icmp_tpi_connect(q, mp);
5358 			return;
5359 		case T_CAPABILITY_REQ:
5360 			icmp_capability_req(q, mp);
5361 			return;
5362 		case T_INFO_REQ:
5363 			icmp_info_req(q, mp);
5364 			return;
5365 		case T_UNITDATA_REQ:
5366 			/*
5367 			 * If a T_UNITDATA_REQ gets here, the address must
5368 			 * be bad.  Valid T_UNITDATA_REQs are found above
5369 			 * and break to below this switch.
5370 			 */
5371 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
5372 			return;
5373 		case T_UNBIND_REQ:
5374 			icmp_tpi_unbind(q, mp);
5375 			return;
5376 
5377 		case T_SVR4_OPTMGMT_REQ:
5378 			/*
5379 			 * All Solaris components should pass a db_credp
5380 			 * for this TPI message, hence we ASSERT.
5381 			 * But in case there is some other M_PROTO that looks
5382 			 * like a TPI message sent by some other kernel
5383 			 * component, we check and return an error.
5384 			 */
5385 			cr = msg_getcred(mp, NULL);
5386 			ASSERT(cr != NULL);
5387 			if (cr == NULL) {
5388 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
5389 				return;
5390 			}
5391 
5392 			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
5393 			    cr)) {
5394 				/* Only IP can return anything meaningful */
5395 				(void) svr4_optcom_req(q, mp, cr,
5396 				    &icmp_opt_obj, B_TRUE);
5397 			}
5398 			return;
5399 
5400 		case T_OPTMGMT_REQ:
5401 			/*
5402 			 * All Solaris components should pass a db_credp
5403 			 * for this TPI message, hence we ASSERT.
5404 			 * But in case there is some other M_PROTO that looks
5405 			 * like a TPI message sent by some other kernel
5406 			 * component, we check and return an error.
5407 			 */
5408 			cr = msg_getcred(mp, NULL);
5409 			ASSERT(cr != NULL);
5410 			if (cr == NULL) {
5411 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
5412 				return;
5413 			}
5414 			/* Only IP can return anything meaningful */
5415 			(void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE);
5416 			return;
5417 
5418 		case T_DISCON_REQ:
5419 			icmp_tpi_disconnect(q, mp);
5420 			return;
5421 
5422 		/* The following TPI message is not supported by icmp. */
5423 		case O_T_CONN_RES:
5424 		case T_CONN_RES:
5425 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
5426 			return;
5427 
5428 		/* The following 3 TPI requests are illegal for icmp. */
5429 		case T_DATA_REQ:
5430 		case T_EXDATA_REQ:
5431 		case T_ORDREL_REQ:
5432 			freemsg(mp);
5433 			(void) putctl1(RD(q), M_ERROR, EPROTO);
5434 			return;
5435 		default:
5436 			break;
5437 		}
5438 		break;
5439 	case M_IOCTL:
5440 		iocp = (struct iocblk *)mp->b_rptr;
5441 		switch (iocp->ioc_cmd) {
5442 		case TI_GETPEERNAME:
5443 			if (icmp->icmp_state != TS_DATA_XFER) {
5444 				/*
5445 				 * If a default destination address has not
5446 				 * been associated with the stream, then we
5447 				 * don't know the peer's name.
5448 				 */
5449 				iocp->ioc_error = ENOTCONN;
5450 		err_ret:;
5451 				iocp->ioc_count = 0;
5452 				mp->b_datap->db_type = M_IOCACK;
5453 				qreply(q, mp);
5454 				return;
5455 			}
5456 			/* FALLTHRU */
5457 		case TI_GETMYNAME:
5458 			/*
5459 			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
5460 			 * need to copyin the user's strbuf structure.
5461 			 * Processing will continue in the M_IOCDATA case
5462 			 * below.
5463 			 */
5464 			mi_copyin(q, mp, NULL,
5465 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
5466 			return;
5467 		case ND_SET:
5468 			/* nd_getset performs the necessary error checking */
5469 		case ND_GET:
5470 			if (nd_getset(q, is->is_nd, mp)) {
5471 				qreply(q, mp);
5472 				return;
5473 			}
5474 			break;
5475 		case _SIOCSOCKFALLBACK:
5476 			/*
5477 			 * socket is falling back to be a
5478 			 * streams socket. Nothing  to do
5479 			 */
5480 			iocp->ioc_count = 0;
5481 			iocp->ioc_rval = 0;
5482 			qreply(q, mp);
5483 			return;
5484 		default:
5485 			break;
5486 		}
5487 		break;
5488 	case M_IOCDATA:
5489 		icmp_wput_iocdata(q, mp);
5490 		return;
5491 	default:
5492 		break;
5493 	}
5494 	ip_wput(q, mp);
5495 }
5496 
5497 /*
5498  * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA
5499  * messages.
5500  */
5501 static void
5502 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
5503 {
5504 	mblk_t	*mp1;
5505 	STRUCT_HANDLE(strbuf, sb);
5506 	icmp_t	*icmp;
5507 	uint_t	addrlen;
5508 	uint_t	error;
5509 
5510 	/* Make sure it is one of ours. */
5511 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5512 	case TI_GETMYNAME:
5513 	case TI_GETPEERNAME:
5514 		break;
5515 	default:
5516 		icmp = Q_TO_ICMP(q);
5517 		ip_output(icmp->icmp_connp, mp, q, IP_WPUT);
5518 		return;
5519 	}
5520 	switch (mi_copy_state(q, mp, &mp1)) {
5521 	case -1:
5522 		return;
5523 	case MI_COPY_CASE(MI_COPY_IN, 1):
5524 		break;
5525 	case MI_COPY_CASE(MI_COPY_OUT, 1):
5526 		/*
5527 		 * The address has been copied out, so now
5528 		 * copyout the strbuf.
5529 		 */
5530 		mi_copyout(q, mp);
5531 		return;
5532 	case MI_COPY_CASE(MI_COPY_OUT, 2):
5533 		/*
5534 		 * The address and strbuf have been copied out.
5535 		 * We're done, so just acknowledge the original
5536 		 * M_IOCTL.
5537 		 */
5538 		mi_copy_done(q, mp, 0);
5539 		return;
5540 	default:
5541 		/*
5542 		 * Something strange has happened, so acknowledge
5543 		 * the original M_IOCTL with an EPROTO error.
5544 		 */
5545 		mi_copy_done(q, mp, EPROTO);
5546 		return;
5547 	}
5548 	/*
5549 	 * Now we have the strbuf structure for TI_GETMYNAME
5550 	 * and TI_GETPEERNAME.  Next we copyout the requested
5551 	 * address and then we'll copyout the strbuf.
5552 	 */
5553 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
5554 	    (void *)mp1->b_rptr);
5555 	icmp = Q_TO_ICMP(q);
5556 	if (icmp->icmp_family == AF_INET)
5557 		addrlen = sizeof (sin_t);
5558 	else
5559 		addrlen = sizeof (sin6_t);
5560 
5561 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
5562 		mi_copy_done(q, mp, EINVAL);
5563 		return;
5564 	}
5565 
5566 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
5567 
5568 	if (mp1 == NULL)
5569 		return;
5570 
5571 	rw_enter(&icmp->icmp_rwlock, RW_READER);
5572 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5573 	case TI_GETMYNAME:
5574 		error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr,
5575 		    &addrlen);
5576 		break;
5577 	case TI_GETPEERNAME:
5578 		error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr,
5579 		    &addrlen);
5580 		break;
5581 	}
5582 	rw_exit(&icmp->icmp_rwlock);
5583 
5584 	if (error != 0) {
5585 		mi_copy_done(q, mp, error);
5586 	} else {
5587 		mp1->b_wptr += addrlen;
5588 		STRUCT_FSET(sb, len, addrlen);
5589 
5590 		/* Copy out the address */
5591 		mi_copyout(q, mp);
5592 	}
5593 }
5594 
5595 static int
5596 icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
5597     void *thisdg_attrs)
5598 {
5599 	struct T_unitdata_req *udreqp;
5600 	int is_absreq_failure;
5601 	cred_t *cr;
5602 
5603 	udreqp = (struct T_unitdata_req *)mp->b_rptr;
5604 	*errorp = 0;
5605 
5606 	/*
5607 	 * All Solaris components should pass a db_credp
5608 	 * for this TPI message, hence we ASSERT.
5609 	 * But in case there is some other M_PROTO that looks
5610 	 * like a TPI message sent by some other kernel
5611 	 * component, we check and return an error.
5612 	 */
5613 	cr = msg_getcred(mp, NULL);
5614 	ASSERT(cr != NULL);
5615 	if (cr == NULL)
5616 		return (-1);
5617 
5618 	*errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
5619 	    udreqp->OPT_offset, cr, &icmp_opt_obj,
5620 	    thisdg_attrs, &is_absreq_failure);
5621 
5622 	if (*errorp != 0) {
5623 		/*
5624 		 * Note: No special action needed in this
5625 		 * module for "is_absreq_failure"
5626 		 */
5627 		return (-1);		/* failure */
5628 	}
5629 	ASSERT(is_absreq_failure == 0);
5630 	return (0);	/* success */
5631 }
5632 
5633 void
5634 icmp_ddi_g_init(void)
5635 {
5636 	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
5637 	    icmp_opt_obj.odb_opt_arr_cnt);
5638 
5639 	/*
5640 	 * We want to be informed each time a stack is created or
5641 	 * destroyed in the kernel, so we can maintain the
5642 	 * set of icmp_stack_t's.
5643 	 */
5644 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
5645 }
5646 
5647 void
5648 icmp_ddi_g_destroy(void)
5649 {
5650 	netstack_unregister(NS_ICMP);
5651 }
5652 
5653 #define	INET_NAME	"ip"
5654 
5655 /*
5656  * Initialize the ICMP stack instance.
5657  */
5658 static void *
5659 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5660 {
5661 	icmp_stack_t	*is;
5662 	icmpparam_t	*pa;
5663 	int		error = 0;
5664 	major_t		major;
5665 
5666 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
5667 	is->is_netstack = ns;
5668 
5669 	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
5670 	is->is_param_arr = pa;
5671 	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
5672 
5673 	(void) icmp_param_register(&is->is_nd,
5674 	    is->is_param_arr, A_CNT(icmp_param_arr));
5675 	is->is_ksp = rawip_kstat_init(stackid);
5676 
5677 	major = mod_name_to_major(INET_NAME);
5678 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
5679 	ASSERT(error == 0);
5680 	return (is);
5681 }
5682 
5683 /*
5684  * Free the ICMP stack instance.
5685  */
5686 static void
5687 rawip_stack_fini(netstackid_t stackid, void *arg)
5688 {
5689 	icmp_stack_t *is = (icmp_stack_t *)arg;
5690 
5691 	nd_free(&is->is_nd);
5692 	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
5693 	is->is_param_arr = NULL;
5694 
5695 	rawip_kstat_fini(stackid, is->is_ksp);
5696 	is->is_ksp = NULL;
5697 	ldi_ident_release(is->is_ldi_ident);
5698 	kmem_free(is, sizeof (*is));
5699 }
5700 
5701 static void *
5702 rawip_kstat_init(netstackid_t stackid) {
5703 	kstat_t	*ksp;
5704 
5705 	rawip_named_kstat_t template = {
5706 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5707 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5708 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5709 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5710 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5711 	};
5712 
5713 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5714 					KSTAT_TYPE_NAMED,
5715 					NUM_OF_FIELDS(rawip_named_kstat_t),
5716 					0, stackid);
5717 	if (ksp == NULL || ksp->ks_data == NULL)
5718 		return (NULL);
5719 
5720 	bcopy(&template, ksp->ks_data, sizeof (template));
5721 	ksp->ks_update = rawip_kstat_update;
5722 	ksp->ks_private = (void *)(uintptr_t)stackid;
5723 
5724 	kstat_install(ksp);
5725 	return (ksp);
5726 }
5727 
5728 static void
5729 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5730 {
5731 	if (ksp != NULL) {
5732 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5733 		kstat_delete_netstack(ksp, stackid);
5734 	}
5735 }
5736 
5737 static int
5738 rawip_kstat_update(kstat_t *ksp, int rw)
5739 {
5740 	rawip_named_kstat_t *rawipkp;
5741 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5742 	netstack_t	*ns;
5743 	icmp_stack_t	*is;
5744 
5745 	if ((ksp == NULL) || (ksp->ks_data == NULL))
5746 		return (EIO);
5747 
5748 	if (rw == KSTAT_WRITE)
5749 		return (EACCES);
5750 
5751 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5752 
5753 	ns = netstack_find_by_stackid(stackid);
5754 	if (ns == NULL)
5755 		return (-1);
5756 	is = ns->netstack_icmp;
5757 	if (is == NULL) {
5758 		netstack_rele(ns);
5759 		return (-1);
5760 	}
5761 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5762 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5763 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5764 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5765 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5766 	netstack_rele(ns);
5767 	return (0);
5768 }
5769 
5770 /* ARGSUSED */
5771 int
5772 rawip_accept(sock_lower_handle_t lproto_handle,
5773     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5774     cred_t *cr)
5775 {
5776 	return (EOPNOTSUPP);
5777 }
5778 
5779 /* ARGSUSED */
5780 int
5781 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5782     socklen_t len, cred_t *cr)
5783 {
5784 	conn_t  *connp = (conn_t *)proto_handle;
5785 	int error;
5786 
5787 	/* All Solaris components should pass a cred for this operation. */
5788 	ASSERT(cr != NULL);
5789 
5790 	/* Binding to a NULL address really means unbind */
5791 	if (sa == NULL)
5792 		error = rawip_do_unbind(connp);
5793 	else
5794 		error = rawip_do_bind(connp, sa, len);
5795 
5796 	if (error < 0) {
5797 		if (error == -TOUTSTATE)
5798 			error = EINVAL;
5799 		else
5800 			error = proto_tlitosyserr(-error);
5801 	}
5802 	return (error);
5803 }
5804 
5805 static int
5806 rawip_implicit_bind(conn_t *connp)
5807 {
5808 	sin6_t sin6addr;
5809 	sin_t *sin;
5810 	sin6_t *sin6;
5811 	socklen_t len;
5812 	int error;
5813 
5814 	if (connp->conn_icmp->icmp_family == AF_INET) {
5815 		len = sizeof (struct sockaddr_in);
5816 		sin = (sin_t *)&sin6addr;
5817 		*sin = sin_null;
5818 		sin->sin_family = AF_INET;
5819 		sin->sin_addr.s_addr = INADDR_ANY;
5820 	} else {
5821 		ASSERT(connp->conn_icmp->icmp_family == AF_INET6);
5822 		len = sizeof (sin6_t);
5823 		sin6 = (sin6_t *)&sin6addr;
5824 		*sin6 = sin6_null;
5825 		sin6->sin6_family = AF_INET6;
5826 		V6_SET_ZERO(sin6->sin6_addr);
5827 	}
5828 
5829 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5830 
5831 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5832 }
5833 
5834 static int
5835 rawip_unbind(conn_t *connp)
5836 {
5837 	int error;
5838 
5839 	error = rawip_do_unbind(connp);
5840 	if (error < 0) {
5841 		error = proto_tlitosyserr(-error);
5842 	}
5843 	return (error);
5844 }
5845 
5846 /* ARGSUSED */
5847 int
5848 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5849 {
5850 	return (EOPNOTSUPP);
5851 }
5852 
5853 /* ARGSUSED */
5854 int
5855 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5856     socklen_t len, sock_connid_t *id, cred_t *cr)
5857 {
5858 	conn_t	*connp = (conn_t *)proto_handle;
5859 	icmp_t *icmp = connp->conn_icmp;
5860 	int	error;
5861 	boolean_t did_bind = B_FALSE;
5862 
5863 	/* All Solaris components should pass a cred for this operation. */
5864 	ASSERT(cr != NULL);
5865 
5866 	if (sa == NULL) {
5867 		/*
5868 		 * Disconnect
5869 		 * Make sure we are connected
5870 		 */
5871 		if (icmp->icmp_state != TS_DATA_XFER)
5872 			return (EINVAL);
5873 
5874 		error = icmp_disconnect(connp);
5875 		return (error);
5876 	}
5877 
5878 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
5879 	if (error != 0)
5880 		return (error);
5881 
5882 	/* do an implicit bind if necessary */
5883 	if (icmp->icmp_state == TS_UNBND) {
5884 		error = rawip_implicit_bind(connp);
5885 		/*
5886 		 * We could be racing with an actual bind, in which case
5887 		 * we would see EPROTO. We cross our fingers and try
5888 		 * to connect.
5889 		 */
5890 		if (!(error == 0 || error == EPROTO))
5891 			return (error);
5892 		did_bind = B_TRUE;
5893 	}
5894 
5895 	/*
5896 	 * set SO_DGRAM_ERRIND
5897 	 */
5898 	icmp->icmp_dgram_errind = B_TRUE;
5899 
5900 	error = rawip_do_connect(connp, sa, len, cr);
5901 
5902 	if (error != 0 && did_bind) {
5903 		int unbind_err;
5904 
5905 		unbind_err = rawip_unbind(connp);
5906 		ASSERT(unbind_err == 0);
5907 	}
5908 
5909 	if (error == 0) {
5910 		*id = 0;
5911 		(*connp->conn_upcalls->su_connected)
5912 		    (connp->conn_upper_handle, 0, NULL, -1);
5913 	} else if (error < 0) {
5914 		error = proto_tlitosyserr(-error);
5915 	}
5916 	return (error);
5917 }
5918 
5919 /* ARGSUSED */
5920 int
5921 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
5922     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
5923 {
5924 	conn_t  *connp = (conn_t *)proto_handle;
5925 	icmp_t	*icmp;
5926 	struct T_capability_ack tca;
5927 	struct sockaddr_in6 laddr, faddr;
5928 	socklen_t laddrlen, faddrlen;
5929 	short opts;
5930 	struct stroptions *stropt;
5931 	mblk_t *stropt_mp;
5932 	int error;
5933 
5934 	icmp = connp->conn_icmp;
5935 
5936 	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
5937 
5938 	/*
5939 	 * setup the fallback stream that was allocated
5940 	 */
5941 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
5942 	connp->conn_minor_arena = WR(q)->q_ptr;
5943 
5944 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
5945 
5946 	WR(q)->q_qinfo = &icmpwinit;
5947 
5948 	connp->conn_rq = RD(q);
5949 	connp->conn_wq = WR(q);
5950 
5951 	/* Notify stream head about options before sending up data */
5952 	stropt_mp->b_datap->db_type = M_SETOPTS;
5953 	stropt_mp->b_wptr += sizeof (*stropt);
5954 	stropt = (struct stroptions *)stropt_mp->b_rptr;
5955 	stropt->so_flags = SO_WROFF | SO_HIWAT;
5956 	stropt->so_wroff =
5957 	    (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra);
5958 	stropt->so_hiwat = icmp->icmp_recv_hiwat;
5959 	putnext(RD(q), stropt_mp);
5960 
5961 	/*
5962 	 * free helper stream
5963 	 */
5964 	ip_free_helper_stream(connp);
5965 
5966 	/*
5967 	 * Collect the information needed to sync with the sonode
5968 	 */
5969 	icmp_do_capability_ack(icmp, &tca, TC1_INFO);
5970 
5971 	laddrlen = faddrlen = sizeof (sin6_t);
5972 	(void) rawip_getsockname((sock_lower_handle_t)connp,
5973 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
5974 	error = rawip_getpeername((sock_lower_handle_t)connp,
5975 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
5976 	if (error != 0)
5977 		faddrlen = 0;
5978 	opts = 0;
5979 	if (icmp->icmp_dgram_errind)
5980 		opts |= SO_DGRAM_ERRIND;
5981 	if (icmp->icmp_dontroute)
5982 		opts |= SO_DONTROUTE;
5983 
5984 	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
5985 	    (struct sockaddr *)&laddr, laddrlen,
5986 	    (struct sockaddr *)&faddr, faddrlen, opts);
5987 
5988 	/*
5989 	 * Attempts to send data up during fallback will result in it being
5990 	 * queued in udp_t. Now we push up any queued packets.
5991 	 */
5992 	mutex_enter(&icmp->icmp_recv_lock);
5993 	while (icmp->icmp_fallback_queue_head != NULL) {
5994 		mblk_t	*mp;
5995 
5996 		mp = icmp->icmp_fallback_queue_head;
5997 		icmp->icmp_fallback_queue_head = mp->b_next;
5998 		mp->b_next = NULL;
5999 		mutex_exit(&icmp->icmp_recv_lock);
6000 		putnext(RD(q), mp);
6001 		mutex_enter(&icmp->icmp_recv_lock);
6002 	}
6003 	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
6004 
6005 	/*
6006 	 * No longer a streams less socket
6007 	 */
6008 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6009 	connp->conn_flags &= ~IPCL_NONSTR;
6010 	rw_exit(&icmp->icmp_rwlock);
6011 
6012 	mutex_exit(&icmp->icmp_recv_lock);
6013 
6014 	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
6015 	    icmp->icmp_fallback_queue_tail == NULL);
6016 
6017 	ASSERT(connp->conn_ref >= 1);
6018 
6019 	return (0);
6020 }
6021 
6022 /* ARGSUSED */
6023 sock_lower_handle_t
6024 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
6025     uint_t *smodep, int *errorp, int flags, cred_t *credp)
6026 {
6027 	conn_t *connp;
6028 
6029 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
6030 		*errorp = EPROTONOSUPPORT;
6031 		return (NULL);
6032 	}
6033 
6034 	connp = icmp_open(family, credp, errorp, flags);
6035 	if (connp != NULL) {
6036 		icmp_stack_t *is;
6037 
6038 		is = connp->conn_icmp->icmp_is;
6039 		connp->conn_flags |= IPCL_NONSTR;
6040 
6041 		if (connp->conn_icmp->icmp_family == AF_INET6) {
6042 			/* Build initial header template for transmit */
6043 			rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
6044 			if ((*errorp =
6045 			    icmp_build_hdrs(connp->conn_icmp)) != 0) {
6046 				rw_exit(&connp->conn_icmp->icmp_rwlock);
6047 				ipcl_conn_destroy(connp);
6048 				return (NULL);
6049 			}
6050 			rw_exit(&connp->conn_icmp->icmp_rwlock);
6051 		}
6052 
6053 		connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat;
6054 		connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat;
6055 
6056 		if ((*errorp = ip_create_helper_stream(connp,
6057 		    is->is_ldi_ident)) != 0) {
6058 			cmn_err(CE_CONT, "create of IP helper stream failed\n");
6059 			(void) rawip_do_close(connp);
6060 			return (NULL);
6061 		}
6062 
6063 		mutex_enter(&connp->conn_lock);
6064 		connp->conn_state_flags &= ~CONN_INCIPIENT;
6065 		mutex_exit(&connp->conn_lock);
6066 		*sock_downcalls = &sock_rawip_downcalls;
6067 		*smodep = SM_ATOMIC;
6068 	} else {
6069 		ASSERT(*errorp != 0);
6070 	}
6071 
6072 	return ((sock_lower_handle_t)connp);
6073 }
6074 
6075 /* ARGSUSED */
6076 void
6077 rawip_activate(sock_lower_handle_t proto_handle,
6078     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
6079     cred_t *cr)
6080 {
6081 	conn_t 			*connp = (conn_t *)proto_handle;
6082 	icmp_stack_t 		*is = connp->conn_icmp->icmp_is;
6083 	struct sock_proto_props sopp;
6084 
6085 	/* All Solaris components should pass a cred for this operation. */
6086 	ASSERT(cr != NULL);
6087 
6088 	connp->conn_upcalls = sock_upcalls;
6089 	connp->conn_upper_handle = sock_handle;
6090 
6091 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
6092 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
6093 	sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len +
6094 	    is->is_wroff_extra;
6095 	sopp.sopp_rxhiwat = is->is_recv_hiwat;
6096 	sopp.sopp_rxlowat = icmp_mod_info.mi_lowat;
6097 	sopp.sopp_maxblk = INFPSZ;
6098 	sopp.sopp_maxpsz = IP_MAXPACKET;
6099 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
6100 	    icmp_mod_info.mi_minpsz;
6101 
6102 	(*connp->conn_upcalls->su_set_proto_props)
6103 	    (connp->conn_upper_handle, &sopp);
6104 }
6105 
6106 static int
6107 rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6108 {
6109 	sin_t	*sin = (sin_t *)sa;
6110 	sin6_t	*sin6 = (sin6_t *)sa;
6111 
6112 	ASSERT(icmp != NULL);
6113 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6114 
6115 	switch (icmp->icmp_family) {
6116 	case AF_INET:
6117 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6118 		if (*salenp < sizeof (sin_t))
6119 			return (EINVAL);
6120 
6121 		*salenp = sizeof (sin_t);
6122 		*sin = sin_null;
6123 		sin->sin_family = AF_INET;
6124 		if (icmp->icmp_state == TS_UNBND) {
6125 			break;
6126 		}
6127 
6128 		if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
6129 		    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6130 			sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src);
6131 		} else {
6132 			/*
6133 			 * INADDR_ANY
6134 			 * icmp_v6src is not set, we might be bound to
6135 			 * broadcast/multicast. Use icmp_bound_v6src as
6136 			 * local address instead (that could
6137 			 * also still be INADDR_ANY)
6138 			 */
6139 			sin->sin_addr.s_addr =
6140 			    V4_PART_OF_V6(icmp->icmp_bound_v6src);
6141 		}
6142 		break;
6143 	case AF_INET6:
6144 
6145 		if (*salenp < sizeof (sin6_t))
6146 			return (EINVAL);
6147 
6148 		*salenp = sizeof (sin6_t);
6149 		*sin6 = sin6_null;
6150 		sin6->sin6_family = AF_INET6;
6151 		if (icmp->icmp_state == TS_UNBND) {
6152 			break;
6153 		}
6154 		if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6155 			sin6->sin6_addr = icmp->icmp_v6src;
6156 		} else {
6157 			/*
6158 			 * UNSPECIFIED
6159 			 * icmp_v6src is not set, we might be bound to
6160 			 * broadcast/multicast. Use icmp_bound_v6src as
6161 			 * local address instead (that could
6162 			 * also still be UNSPECIFIED)
6163 			 */
6164 
6165 			sin6->sin6_addr = icmp->icmp_bound_v6src;
6166 		}
6167 		break;
6168 	}
6169 	return (0);
6170 }
6171 
6172 static int
6173 rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6174 {
6175 	sin_t   *sin = (sin_t *)sa;
6176 	sin6_t  *sin6 = (sin6_t *)sa;
6177 
6178 	ASSERT(icmp != NULL);
6179 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6180 
6181 	if (icmp->icmp_state != TS_DATA_XFER)
6182 		return (ENOTCONN);
6183 
6184 	sa->sa_family = icmp->icmp_family;
6185 	switch (icmp->icmp_family) {
6186 	case AF_INET:
6187 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6188 
6189 		if (*salenp < sizeof (sin_t))
6190 			return (EINVAL);
6191 
6192 		*salenp = sizeof (sin_t);
6193 		*sin = sin_null;
6194 		sin->sin_family = AF_INET;
6195 		sin->sin_addr.s_addr =
6196 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6197 		break;
6198 	case AF_INET6:
6199 		if (*salenp < sizeof (sin6_t))
6200 			return (EINVAL);
6201 
6202 		*salenp = sizeof (sin6_t);
6203 		*sin6 = sin6_null;
6204 		*sin6 = icmp->icmp_v6dst;
6205 		break;
6206 	}
6207 	return (0);
6208 }
6209 
6210 /* ARGSUSED */
6211 int
6212 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6213     socklen_t *salenp, cred_t *cr)
6214 {
6215 	conn_t  *connp = (conn_t *)proto_handle;
6216 	icmp_t  *icmp = connp->conn_icmp;
6217 	int	error;
6218 
6219 	/* All Solaris components should pass a cred for this operation. */
6220 	ASSERT(cr != NULL);
6221 
6222 	ASSERT(icmp != NULL);
6223 
6224 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6225 
6226 	error = rawip_do_getpeername(icmp, sa, salenp);
6227 
6228 	rw_exit(&icmp->icmp_rwlock);
6229 
6230 	return (error);
6231 }
6232 
6233 /* ARGSUSED */
6234 int
6235 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6236     socklen_t *salenp, cred_t *cr)
6237 {
6238 	conn_t  *connp = (conn_t *)proto_handle;
6239 	icmp_t	*icmp = connp->conn_icmp;
6240 	int	error;
6241 
6242 	/* All Solaris components should pass a cred for this operation. */
6243 	ASSERT(cr != NULL);
6244 
6245 	ASSERT(icmp != NULL);
6246 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6247 
6248 	error = rawip_do_getsockname(icmp, sa, salenp);
6249 
6250 	rw_exit(&icmp->icmp_rwlock);
6251 
6252 	return (error);
6253 }
6254 
6255 int
6256 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6257     const void *optvalp, socklen_t optlen, cred_t *cr)
6258 {
6259 	conn_t	*connp = (conn_t *)proto_handle;
6260 	icmp_t *icmp = connp->conn_icmp;
6261 	int error;
6262 
6263 	/* All Solaris components should pass a cred for this operation. */
6264 	ASSERT(cr != NULL);
6265 
6266 	error = proto_opt_check(level, option_name, optlen, NULL,
6267 	    icmp_opt_obj.odb_opt_des_arr,
6268 	    icmp_opt_obj.odb_opt_arr_cnt,
6269 	    icmp_opt_obj.odb_topmost_tpiprovider,
6270 	    B_TRUE, B_FALSE, cr);
6271 
6272 	if (error != 0) {
6273 		/*
6274 		 * option not recognized
6275 		 */
6276 		if (error < 0) {
6277 			error = proto_tlitosyserr(-error);
6278 		}
6279 		return (error);
6280 	}
6281 
6282 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6283 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
6284 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
6285 	    (uchar_t *)optvalp, NULL, cr);
6286 	rw_exit(&icmp->icmp_rwlock);
6287 
6288 	if (error < 0) {
6289 		/*
6290 		 * Pass on to ip
6291 		 */
6292 		error = ip_set_options(connp, level, option_name, optvalp,
6293 		    optlen, cr);
6294 	}
6295 
6296 	ASSERT(error >= 0);
6297 
6298 	return (error);
6299 }
6300 
6301 int
6302 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6303     void *optvalp, socklen_t *optlen, cred_t *cr)
6304 {
6305 	int		error;
6306 	conn_t		*connp = (conn_t *)proto_handle;
6307 	icmp_t		*icmp = connp->conn_icmp;
6308 	t_uscalar_t	max_optbuf_len;
6309 	void		*optvalp_buf;
6310 	int		len;
6311 
6312 	/* All Solaris components should pass a cred for this operation. */
6313 	ASSERT(cr != NULL);
6314 
6315 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
6316 	    icmp_opt_obj.odb_opt_des_arr,
6317 	    icmp_opt_obj.odb_opt_arr_cnt,
6318 	    icmp_opt_obj.odb_topmost_tpiprovider,
6319 	    B_FALSE, B_TRUE, cr);
6320 
6321 	if (error != 0) {
6322 		if (error < 0) {
6323 			error = proto_tlitosyserr(-error);
6324 		}
6325 		return (error);
6326 	}
6327 
6328 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
6329 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6330 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
6331 	rw_exit(&icmp->icmp_rwlock);
6332 
6333 	if (len < 0) {
6334 		/*
6335 		 * Pass on to IP
6336 		 */
6337 		kmem_free(optvalp_buf, max_optbuf_len);
6338 		return (ip_get_options(connp, level, option_name, optvalp,
6339 		    optlen, cr));
6340 	} else {
6341 		/*
6342 		 * update optlen and copy option value
6343 		 */
6344 		t_uscalar_t size = MIN(len, *optlen);
6345 		bcopy(optvalp_buf, optvalp, size);
6346 		bcopy(&size, optlen, sizeof (size));
6347 
6348 		kmem_free(optvalp_buf, max_optbuf_len);
6349 		return (0);
6350 	}
6351 }
6352 
6353 /* ARGSUSED */
6354 int
6355 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
6356 {
6357 	conn_t	*connp = (conn_t *)proto_handle;
6358 
6359 	/* All Solaris components should pass a cred for this operation. */
6360 	ASSERT(cr != NULL);
6361 
6362 	(void) rawip_do_close(connp);
6363 	return (0);
6364 }
6365 
6366 /* ARGSUSED */
6367 int
6368 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
6369 {
6370 	conn_t  *connp = (conn_t *)proto_handle;
6371 
6372 	/* All Solaris components should pass a cred for this operation. */
6373 	ASSERT(cr != NULL);
6374 
6375 	/* shut down the send side */
6376 	if (how != SHUT_RD)
6377 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6378 		    SOCK_OPCTL_SHUT_SEND, 0);
6379 	/* shut down the recv side */
6380 	if (how != SHUT_WR)
6381 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6382 		    SOCK_OPCTL_SHUT_RECV, 0);
6383 	return (0);
6384 }
6385 
6386 void
6387 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
6388 {
6389 	conn_t  *connp = (conn_t *)proto_handle;
6390 	icmp_t	*icmp = connp->conn_icmp;
6391 
6392 	mutex_enter(&icmp->icmp_recv_lock);
6393 	connp->conn_flow_cntrld = B_FALSE;
6394 	mutex_exit(&icmp->icmp_recv_lock);
6395 }
6396 
6397 int
6398 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
6399     int mode, int32_t *rvalp, cred_t *cr)
6400 {
6401 	conn_t  	*connp = (conn_t *)proto_handle;
6402 	int		error;
6403 
6404 	/* All Solaris components should pass a cred for this operation. */
6405 	ASSERT(cr != NULL);
6406 
6407 	switch (cmd) {
6408 	case ND_SET:
6409 	case ND_GET:
6410 	case _SIOCSOCKFALLBACK:
6411 	case TI_GETPEERNAME:
6412 	case TI_GETMYNAME:
6413 #ifdef DEBUG
6414 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
6415 		    " socket", cmd);
6416 #endif
6417 		error = EINVAL;
6418 		break;
6419 	default:
6420 		/*
6421 		 * Pass on to IP using helper stream
6422 		 */
6423 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
6424 		    cmd, arg, mode, cr, rvalp);
6425 		break;
6426 	}
6427 	return (error);
6428 }
6429 
6430 /* ARGSUSED */
6431 int
6432 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
6433     cred_t *cr)
6434 {
6435 	conn_t *connp = (conn_t *)proto_handle;
6436 	icmp_t	*icmp = connp->conn_icmp;
6437 	icmp_stack_t *is = icmp->icmp_is;
6438 	int error = 0;
6439 	boolean_t bypass_dgram_errind = B_FALSE;
6440 
6441 	ASSERT(DB_TYPE(mp) == M_DATA);
6442 
6443 	/* All Solaris components should pass a cred for this operation. */
6444 	ASSERT(cr != NULL);
6445 
6446 	/* If labeled then sockfs should have already set db_credp */
6447 	ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
6448 
6449 	/* do an implicit bind if necessary */
6450 	if (icmp->icmp_state == TS_UNBND) {
6451 		error = rawip_implicit_bind(connp);
6452 		/*
6453 		 * We could be racing with an actual bind, in which case
6454 		 * we would see EPROTO. We cross our fingers and try
6455 		 * to connect.
6456 		 */
6457 		if (!(error == 0 || error == EPROTO)) {
6458 			freemsg(mp);
6459 			return (error);
6460 		}
6461 	}
6462 
6463 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6464 
6465 	if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) {
6466 		error = EISCONN;
6467 		goto done_lock;
6468 	}
6469 
6470 	switch (icmp->icmp_family) {
6471 	case AF_INET6: {
6472 		sin6_t	*sin6;
6473 		ip6_pkt_t	ipp_s;	/* For ancillary data options */
6474 		ip6_pkt_t	*ipp = &ipp_s;
6475 
6476 		sin6 = (sin6_t *)msg->msg_name;
6477 		if (sin6 != NULL) {
6478 			error = proto_verify_ip_addr(icmp->icmp_family,
6479 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6480 			if (error != 0) {
6481 				bypass_dgram_errind = B_TRUE;
6482 				goto done_lock;
6483 			}
6484 			if (icmp->icmp_delayed_error != 0) {
6485 				sin6_t  *sin1 = (sin6_t *)msg->msg_name;
6486 				sin6_t  *sin2 = (sin6_t *)
6487 				    &icmp->icmp_delayed_addr;
6488 
6489 				error = icmp->icmp_delayed_error;
6490 				icmp->icmp_delayed_error = 0;
6491 
6492 				/* Compare IP address and port */
6493 
6494 				if (sin1->sin6_port == sin2->sin6_port &&
6495 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
6496 				    &sin2->sin6_addr)) {
6497 					goto done_lock;
6498 				}
6499 			}
6500 		} else {
6501 			/*
6502 			 * Use connected address
6503 			 */
6504 			if (icmp->icmp_state != TS_DATA_XFER) {
6505 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6506 				error = EDESTADDRREQ;
6507 				bypass_dgram_errind = B_TRUE;
6508 				goto done_lock;
6509 			}
6510 			sin6 = &icmp->icmp_v6dst;
6511 		}
6512 
6513 		/* No support for mapped addresses on raw sockets */
6514 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6515 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6516 			error = EADDRNOTAVAIL;
6517 			goto done_lock;
6518 		}
6519 
6520 		ipp->ipp_fields = 0;
6521 		ipp->ipp_sticky_ignored = 0;
6522 
6523 		/*
6524 		 * If options passed in, feed it for verification and handling
6525 		 */
6526 		if (msg->msg_controllen != 0) {
6527 			error = process_auxiliary_options(connp,
6528 			    msg->msg_control, msg->msg_controllen,
6529 			    ipp, &icmp_opt_obj, icmp_opt_set, cr);
6530 			if (error != 0) {
6531 				goto done_lock;
6532 			}
6533 		}
6534 
6535 		rw_exit(&icmp->icmp_rwlock);
6536 
6537 		/*
6538 		 * Destination is a native IPv6 address.
6539 		 * Send out an IPv6 format packet.
6540 		 */
6541 
6542 		error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6,
6543 		    ipp);
6544 	}
6545 		break;
6546 	case AF_INET: {
6547 		sin_t	*sin;
6548 		ip4_pkt_t pktinfo;
6549 		ip4_pkt_t *pktinfop = &pktinfo;
6550 		ipaddr_t	v4dst;
6551 
6552 		sin = (sin_t *)msg->msg_name;
6553 		if (sin != NULL) {
6554 			error = proto_verify_ip_addr(icmp->icmp_family,
6555 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6556 			if (error != 0) {
6557 				bypass_dgram_errind = B_TRUE;
6558 				goto done_lock;
6559 			}
6560 			v4dst = sin->sin_addr.s_addr;
6561 			if (icmp->icmp_delayed_error != 0) {
6562 				sin_t *sin1 = (sin_t *)msg->msg_name;
6563 				sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
6564 
6565 				error = icmp->icmp_delayed_error;
6566 				icmp->icmp_delayed_error = 0;
6567 
6568 				/* Compare IP address and port */
6569 				if (sin1->sin_port == sin2->sin_port &&
6570 				    sin1->sin_addr.s_addr ==
6571 				    sin2->sin_addr.s_addr) {
6572 					goto done_lock;
6573 				}
6574 
6575 			}
6576 		} else {
6577 			/*
6578 			 * Use connected address
6579 			 */
6580 			if (icmp->icmp_state != TS_DATA_XFER) {
6581 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6582 				error = EDESTADDRREQ;
6583 				bypass_dgram_errind = B_TRUE;
6584 				goto done_lock;
6585 			}
6586 			v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6587 		}
6588 
6589 
6590 		pktinfop->ip4_ill_index = 0;
6591 		pktinfop->ip4_addr = INADDR_ANY;
6592 
6593 		/*
6594 		 * If options passed in, feed it for verification and handling
6595 		 */
6596 		if (msg->msg_controllen != 0) {
6597 			error = process_auxiliary_options(connp,
6598 			    msg->msg_control, msg->msg_controllen,
6599 			    pktinfop, &icmp_opt_obj, icmp_opt_set, cr);
6600 			if (error != 0) {
6601 				goto done_lock;
6602 			}
6603 		}
6604 		rw_exit(&icmp->icmp_rwlock);
6605 
6606 		error = raw_ip_send_data_v4(connp->conn_wq, connp, mp,
6607 		    v4dst, pktinfop);
6608 		break;
6609 	}
6610 
6611 	default:
6612 		ASSERT(0);
6613 	}
6614 
6615 	goto done;
6616 
6617 done_lock:
6618 	rw_exit(&icmp->icmp_rwlock);
6619 	if (error != 0) {
6620 		ASSERT(mp != NULL);
6621 		freemsg(mp);
6622 	}
6623 done:
6624 	if (bypass_dgram_errind)
6625 		return (error);
6626 	return (icmp->icmp_dgram_errind ? error : 0);
6627 }
6628 
/*
 * Downcall vector for raw IP sockets.  The entries are positional, so
 * their order must match the member order of sock_downcalls_t.
 *
 * NOTE(review): the three NULL entries are presumably optional downcalls
 * that raw IP does not implement; confirm which members they correspond to
 * against the sock_downcalls_t definition.
 */
sock_downcalls_t sock_rawip_downcalls = {
	rawip_activate,
	rawip_accept,
	rawip_bind,
	rawip_listen,
	rawip_connect,
	rawip_getpeername,
	rawip_getsockname,
	rawip_getsockopt,
	rawip_setsockopt,
	rawip_send,
	NULL,
	NULL,
	NULL,
	rawip_shutdown,
	rawip_clr_flowctrl,
	rawip_ioctl,
	rawip_close
};
6648