xref: /illumos-gate/usr/src/uts/common/inet/ip/icmp.c (revision c211fc479225fa54805cf480633bf6689ca9a2db)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/stropts.h>
30 #include <sys/strlog.h>
31 #include <sys/strsun.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/timod.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/kmem.h>
41 #include <sys/policy.h>
42 #include <sys/priv.h>
43 #include <sys/zone.h>
44 #include <sys/time.h>
45 
46 #include <sys/sockio.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/isa_defs.h>
50 #include <sys/suntpi.h>
51 #include <sys/xti_inet.h>
52 #include <sys/netstack.h>
53 
54 #include <net/route.h>
55 #include <net/if.h>
56 
57 #include <netinet/in.h>
58 #include <netinet/ip6.h>
59 #include <netinet/icmp6.h>
60 #include <inet/common.h>
61 #include <inet/ip.h>
62 #include <inet/ip6.h>
63 #include <inet/proto_set.h>
64 #include <inet/nd.h>
65 #include <inet/optcom.h>
66 #include <inet/snmpcom.h>
67 #include <inet/kstatcom.h>
68 #include <inet/rawip_impl.h>
69 
70 #include <netinet/ip_mroute.h>
71 #include <inet/tcp.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 #include <inet/ipclassifier.h>
75 
76 #include <sys/tsol/label.h>
77 #include <sys/tsol/tnet.h>
78 
79 #include <inet/ip_ire.h>
80 #include <inet/ip_if.h>
81 
82 #include <inet/ip_impl.h>
83 #include <sys/disp.h>
84 
85 /*
86  * Synchronization notes:
87  *
88  * RAWIP is MT and uses the usual kernel synchronization primitives. There is
89  * one lock, icmp_rwlock. We also use conn_lock when updating things
90  * which affect the IP classifier lookup.
91  * The lock order is icmp_rwlock -> conn_lock.
92  *
93  * The icmp_rwlock:
94  * This protects most of the other fields in the icmp_t. The exact list of
95  * fields which are protected by each of the above locks is documented in
96  * the icmp_t structure definition.
97  *
98  * Plumbing notes:
99  * ICMP is always a device driver. For compatibility with mibopen() code
100  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
101  * dummy module.
102  */
103 
104 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
105 static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
106 static int	icmp_bind_proto(conn_t *connp);
107 static int	icmp_build_hdrs(icmp_t *icmp);
108 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
109 static int	icmp_close(queue_t *q, int flags);
110 static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
111 static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
112 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
113 		    int sys_error);
114 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
115 		    t_scalar_t t_error, int sys_error);
116 static void	icmp_icmp_error(conn_t *connp, mblk_t *mp);
117 static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp);
118 static void	icmp_info_req(queue_t *q, mblk_t *mp);
119 static void	icmp_input(void *, mblk_t *, void *);
120 static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
121 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
122 		    cred_t *credp);
123 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
124 		    cred_t *credp);
125 static int	icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
126 		    int *errorp, void *thisdg_attrs);
127 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
128 int		icmp_opt_set(conn_t *connp, uint_t optset_context,
129 		    int level, int name, uint_t inlen,
130 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
131 		    void *thisdg_attrs, cred_t *cr);
132 int		icmp_opt_get(conn_t *connp, int level, int name,
133 		    uchar_t *ptr);
134 static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
135 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
136 static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
137 		    caddr_t cp, cred_t *cr);
138 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
139 		    uchar_t *ptr, int len);
140 static int	icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
141 		    cred_t *cr);
142 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
143 static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
144 static int	icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst);
145 static void	icmp_wput(queue_t *q, mblk_t *mp);
146 static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
147 static int	raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp,
148 		    sin6_t *sin6, ip6_pkt_t *ipp);
149 static int	raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp,
150 		    ipaddr_t v4dst, ip4_pkt_t *pktinfop);
151 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
152 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
153 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
154 static void	icmp_ulp_recv(conn_t *, mblk_t *);
155 
156 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
157 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
158 
159 static void	*rawip_kstat_init(netstackid_t stackid);
160 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
161 static int	rawip_kstat_update(kstat_t *kp, int rw);
162 static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);
163 static int	rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa,
164 		    uint_t *salenp);
165 static int	rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa,
166 		    uint_t *salenp);
167 
168 int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
169 		    socklen_t *, cred_t *);
170 int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
171 		    socklen_t *, cred_t *);
172 
/*
 * STREAMS module information referenced by every qinit structure defined
 * in this file.
 * NOTE(review): the initializers presumably follow the struct module_info
 * member order (module id, module name, min packet size, max packet size,
 * high water mark, low water mark) - confirm against <sys/stream.h>.
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};
176 
177 /*
178  * Entry points for ICMP as a device.
179  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
180  */
/* Read-side queue for /dev/icmp: opened through icmp_openv4. */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

/*
 * Read-side queue for /dev/icmp6: identical to the IPv4 one except that
 * it is opened through icmp_openv6.
 */
static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};
188 
/*
 * Write-side queue shared by both devices; messages are handed to
 * icmp_wput.  No open/close routines are given here - those live on the
 * read-side structures above.
 */
static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info
};
192 
/*
 * ICMP write-side entry point during fallback: same layout as icmpwinit
 * but messages are handed to icmp_wput_fallback instead of icmp_wput.
 */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};
197 
/*
 * For AF_INET aka /dev/icmp: pairs the IPv4 read-side queue with the
 * common write-side queue.
 */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/*
 * For AF_INET6 aka /dev/icmp6: pairs the IPv6 read-side queue with the
 * common write-side queue.
 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};
207 
/*
 * All-zero socket addresses.  Being file-scope statics they are zero
 * initialized; they are copied by structure assignment to clear an address
 * and their address members serve as the wildcard in icmp_bind_proto().
 */
static sin_t	sin_null;	/* Zero address for quick clears */
static sin6_t	sin6_null;	/* Zero address for quick clears */
210 
/*
 * Default structure copied into T_INFO_ACK messages.
 * The address size, option size and current state are placeholders here
 * (see the per-member comments below).
 */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
225 
/*
 * Table of ND variables supported by icmp.  These are loaded into is_nd
 * when the stack instance is created.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Each entry is { minimum, maximum, default value, name }.  The is_*
 * macros that follow refer to entries by index, so the order of the table
 * and the indices used by the macros must be kept in step.
 */
static icmpparam_t	icmp_param_arr[] = {
	/* min	max	value	name */
	{ 0,	128,	32,	"icmp_wroff_extra" },
	{ 1,	255,	255,	"icmp_ipv4_ttl" },
	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
	{ 0,	1,	1,	"icmp_bsd_compat" },
	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
};
/*
 * Shorthand for the current value of each tunable.  These expand to a
 * member named is_param_arr, so they are meant to be applied to a
 * structure that has one.
 * NOTE(review): presumably the per-stack copy of the table above - confirm
 * against the structure definition.
 */
#define	is_wroff_extra			is_param_arr[0].icmp_param_value
#define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
#define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
#define	is_bsd_compat			is_param_arr[3].icmp_param_value
#define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
#define	is_xmit_lowat			is_param_arr[5].icmp_param_value
#define	is_recv_hiwat			is_param_arr[6].icmp_param_value
#define	is_max_buf			is_param_arr[7].icmp_param_value
250 
251 static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len);
252 static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa,
253     socklen_t len, cred_t *cr);
254 static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error);
255 
256 /*
257  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
258  * passed to icmp_wput.
259  * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP
260  * protocol type placed in the message following the address. A T_BIND_ACK
261  * message is returned by ip_bind_v4/v6.
262  */
263 static void
264 icmp_tpi_bind(queue_t *q, mblk_t *mp)
265 {
266 	int	error;
267 	struct sockaddr *sa;
268 	struct T_bind_req *tbr;
269 	socklen_t	len;
270 	sin_t	*sin;
271 	sin6_t	*sin6;
272 	icmp_t		*icmp;
273 	conn_t	*connp = Q_TO_CONN(q);
274 	mblk_t *mp1;
275 	cred_t *cr;
276 
277 	/*
278 	 * All Solaris components should pass a db_credp
279 	 * for this TPI message, hence we ASSERT.
280 	 * But in case there is some other M_PROTO that looks
281 	 * like a TPI message sent by some other kernel
282 	 * component, we check and return an error.
283 	 */
284 	cr = msg_getcred(mp, NULL);
285 	ASSERT(cr != NULL);
286 	if (cr == NULL) {
287 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
288 		return;
289 	}
290 
291 	icmp = connp->conn_icmp;
292 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
293 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
294 		    "icmp_bind: bad req, len %u",
295 		    (uint_t)(mp->b_wptr - mp->b_rptr));
296 		icmp_err_ack(q, mp, TPROTO, 0);
297 		return;
298 	}
299 
300 	if (icmp->icmp_state != TS_UNBND) {
301 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
302 		    "icmp_bind: bad state, %d", icmp->icmp_state);
303 		icmp_err_ack(q, mp, TOUTSTATE, 0);
304 		return;
305 	}
306 
307 	/*
308 	 * Reallocate the message to make sure we have enough room for an
309 	 * address and the protocol type.
310 	 */
311 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
312 	if (!mp1) {
313 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
314 		return;
315 	}
316 	mp = mp1;
317 
318 	/* Reset the message type in preparation for shipping it back. */
319 	DB_TYPE(mp) = M_PCPROTO;
320 	tbr = (struct T_bind_req *)mp->b_rptr;
321 	len = tbr->ADDR_length;
322 	switch (len) {
323 	case 0:	/* request for a generic port */
324 		tbr->ADDR_offset = sizeof (struct T_bind_req);
325 		if (icmp->icmp_family == AF_INET) {
326 			tbr->ADDR_length = sizeof (sin_t);
327 			sin = (sin_t *)&tbr[1];
328 			*sin = sin_null;
329 			sin->sin_family = AF_INET;
330 			mp->b_wptr = (uchar_t *)&sin[1];
331 			sa = (struct sockaddr *)sin;
332 			len = sizeof (sin_t);
333 		} else {
334 			ASSERT(icmp->icmp_family == AF_INET6);
335 			tbr->ADDR_length = sizeof (sin6_t);
336 			sin6 = (sin6_t *)&tbr[1];
337 			*sin6 = sin6_null;
338 			sin6->sin6_family = AF_INET6;
339 			mp->b_wptr = (uchar_t *)&sin6[1];
340 			sa = (struct sockaddr *)sin6;
341 			len = sizeof (sin6_t);
342 		}
343 		break;
344 
345 	case sizeof (sin_t):	/* Complete IPv4 address */
346 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
347 		    sizeof (sin_t));
348 		break;
349 
350 	case sizeof (sin6_t):	/* Complete IPv6 address */
351 		sa = (struct sockaddr *)mi_offset_param(mp,
352 		    tbr->ADDR_offset, sizeof (sin6_t));
353 		break;
354 
355 	default:
356 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
357 		    "icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
358 		icmp_err_ack(q, mp, TBADADDR, 0);
359 		return;
360 	}
361 
362 	error = rawip_do_bind(connp, sa, len);
363 done:
364 	ASSERT(mp->b_cont == NULL);
365 	if (error != 0) {
366 		if (error > 0) {
367 			icmp_err_ack(q, mp, TSYSERR, error);
368 		} else {
369 			icmp_err_ack(q, mp, -error, 0);
370 		}
371 	} else {
372 		tbr->PRIM_type = T_BIND_ACK;
373 		qreply(q, mp);
374 	}
375 }
376 
377 static int
378 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
379 {
380 	sin_t		*sin;
381 	sin6_t		*sin6;
382 	icmp_t		*icmp;
383 	int		error = 0;
384 	mblk_t		*ire_mp;
385 
386 
387 	icmp = connp->conn_icmp;
388 
389 	if (sa == NULL || !OK_32PTR((char *)sa)) {
390 		return (EINVAL);
391 	}
392 
393 	/*
394 	 * The state must be TS_UNBND. TPI mandates that users must send
395 	 * TPI primitives only 1 at a time and wait for the response before
396 	 * sending the next primitive.
397 	 */
398 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
399 	if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
400 		error = -TOUTSTATE;
401 		goto done;
402 	}
403 
404 	ASSERT(len != 0);
405 	switch (len) {
406 	case sizeof (sin_t):    /* Complete IPv4 address */
407 		sin = (sin_t *)sa;
408 		if (sin->sin_family != AF_INET ||
409 		    icmp->icmp_family != AF_INET) {
410 			/* TSYSERR, EAFNOSUPPORT */
411 			error = EAFNOSUPPORT;
412 			goto done;
413 		}
414 		break;
415 	case sizeof (sin6_t): /* Complete IPv6 address */
416 		sin6 = (sin6_t *)sa;
417 		if (sin6->sin6_family != AF_INET6 ||
418 		    icmp->icmp_family != AF_INET6) {
419 			/* TSYSERR, EAFNOSUPPORT */
420 			error = EAFNOSUPPORT;
421 			goto done;
422 		}
423 		/* No support for mapped addresses on raw sockets */
424 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
425 			/* TSYSERR, EADDRNOTAVAIL */
426 			error = EADDRNOTAVAIL;
427 			goto done;
428 		}
429 		break;
430 
431 	default:
432 		/* TBADADDR */
433 		error = EADDRNOTAVAIL;
434 		goto done;
435 	}
436 
437 	icmp->icmp_pending_op = T_BIND_REQ;
438 	icmp->icmp_state = TS_IDLE;
439 
440 	/*
441 	 * Copy the source address into our icmp structure.  This address
442 	 * may still be zero; if so, ip will fill in the correct address
443 	 * each time an outbound packet is passed to it.
444 	 * If we are binding to a broadcast or multicast address then
445 	 * rawip_post_ip_bind_connect will clear the source address.
446 	 */
447 
448 	if (icmp->icmp_family == AF_INET) {
449 		ASSERT(sin != NULL);
450 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
451 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr,
452 		    &icmp->icmp_v6src);
453 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
454 		    icmp->icmp_ip_snd_options_len;
455 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
456 	} else {
457 		int error;
458 
459 		ASSERT(sin6 != NULL);
460 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
461 		icmp->icmp_v6src = sin6->sin6_addr;
462 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
463 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
464 
465 		/* Rebuild the header template */
466 		error = icmp_build_hdrs(icmp);
467 		if (error != 0) {
468 			icmp->icmp_pending_op = -1;
469 			/*
470 			 * TSYSERR
471 			 */
472 			goto done;
473 		}
474 	}
475 
476 	ire_mp = NULL;
477 	if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
478 		/*
479 		 * request an IRE if src not 0 (INADDR_ANY)
480 		 */
481 		ire_mp = allocb(sizeof (ire_t), BPRI_HI);
482 		if (ire_mp == NULL) {
483 			icmp->icmp_pending_op = -1;
484 			error = ENOMEM;
485 			goto done;
486 		}
487 		DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
488 	}
489 done:
490 	rw_exit(&icmp->icmp_rwlock);
491 	if (error != 0)
492 		return (error);
493 
494 	if (icmp->icmp_family == AF_INET6) {
495 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
496 		    &sin6->sin6_addr, sin6->sin6_port, B_TRUE);
497 	} else {
498 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
499 		    sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
500 	}
501 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
502 	return (error);
503 }
504 
505 static void
506 rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error)
507 {
508 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
509 	if (icmp->icmp_state == TS_UNBND) {
510 		/*
511 		 * not yet bound - bind sent by icmp_bind_proto.
512 		 */
513 		rw_exit(&icmp->icmp_rwlock);
514 		return;
515 	}
516 	ASSERT(icmp->icmp_pending_op != -1);
517 	icmp->icmp_pending_op = -1;
518 
519 	if (error != 0) {
520 		if (icmp->icmp_state == TS_DATA_XFER) {
521 			/* Connect failed */
522 			/* Revert back to the bound source */
523 			icmp->icmp_v6src = icmp->icmp_bound_v6src;
524 			icmp->icmp_state = TS_IDLE;
525 			if (icmp->icmp_family == AF_INET6)
526 				(void) icmp_build_hdrs(icmp);
527 		} else {
528 			V6_SET_ZERO(icmp->icmp_v6src);
529 			V6_SET_ZERO(icmp->icmp_bound_v6src);
530 			icmp->icmp_state = TS_UNBND;
531 			if (icmp->icmp_family == AF_INET6)
532 				(void) icmp_build_hdrs(icmp);
533 		}
534 	} else {
535 		if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
536 			ire_t *ire;
537 
538 			ire = (ire_t *)ire_mp->b_rptr;
539 			/*
540 			 * If a broadcast/multicast address was bound set
541 			 * the source address to 0.
542 			 * This ensures no datagrams with broadcast address
543 			 * as source address are emitted (which would violate
544 			 * RFC1122 - Hosts requirements)
545 			 * Note: we get IRE_BROADCAST for IPv6
546 			 * to "mark" a multicast local address.
547 			 */
548 
549 
550 			if (ire->ire_type == IRE_BROADCAST &&
551 			    icmp->icmp_state != TS_DATA_XFER) {
552 				/*
553 				 * This was just a local bind to a
554 				 * MC/broadcast addr
555 				 */
556 				V6_SET_ZERO(icmp->icmp_v6src);
557 				if (icmp->icmp_family == AF_INET6)
558 					(void) icmp_build_hdrs(icmp);
559 			}
560 		}
561 
562 	}
563 	rw_exit(&icmp->icmp_rwlock);
564 	if (ire_mp != NULL)
565 		freeb(ire_mp);
566 }
567 
568 /*
569  * Send message to IP to just bind to the protocol.
570  */
571 static int
572 icmp_bind_proto(conn_t *connp)
573 {
574 	icmp_t	*icmp;
575 	int	error;
576 
577 	icmp = connp->conn_icmp;
578 
579 	if (icmp->icmp_family == AF_INET6)
580 		error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto,
581 		    &sin6_null.sin6_addr, 0, B_TRUE);
582 	else
583 		error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto,
584 		    sin_null.sin_addr.s_addr, 0, B_TRUE);
585 
586 	rawip_post_ip_bind_connect(icmp, NULL, error);
587 	return (error);
588 }
589 
590 static void
591 icmp_tpi_connect(queue_t *q, mblk_t *mp)
592 {
593 	conn_t	*connp = Q_TO_CONN(q);
594 	struct T_conn_req	*tcr;
595 	icmp_t	*icmp;
596 	struct sockaddr *sa;
597 	socklen_t len;
598 	int error;
599 	cred_t *cr;
600 
601 	/*
602 	 * All Solaris components should pass a db_credp
603 	 * for this TPI message, hence we ASSERT.
604 	 * But in case there is some other M_PROTO that looks
605 	 * like a TPI message sent by some other kernel
606 	 * component, we check and return an error.
607 	 */
608 	cr = msg_getcred(mp, NULL);
609 	ASSERT(cr != NULL);
610 	if (cr == NULL) {
611 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
612 		return;
613 	}
614 
615 	icmp = connp->conn_icmp;
616 	tcr = (struct T_conn_req *)mp->b_rptr;
617 	/* Sanity checks */
618 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
619 		icmp_err_ack(q, mp, TPROTO, 0);
620 		return;
621 	}
622 
623 	if (tcr->OPT_length != 0) {
624 		icmp_err_ack(q, mp, TBADOPT, 0);
625 		return;
626 	}
627 
628 	len = tcr->DEST_length;
629 
630 	switch (len) {
631 	default:
632 		icmp_err_ack(q, mp, TBADADDR, 0);
633 		return;
634 	case sizeof (sin_t):
635 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
636 		    sizeof (sin_t));
637 		break;
638 	case sizeof (sin6_t):
639 		sa = (struct sockaddr *)mi_offset_param(mp,
640 		    tcr->DEST_offset, sizeof (sin6_t));
641 		break;
642 	}
643 
644 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
645 	if (error != 0) {
646 		icmp_err_ack(q, mp, TSYSERR, error);
647 		return;
648 	}
649 
650 	error = rawip_do_connect(connp, sa, len, cr);
651 	if (error != 0) {
652 		if (error < 0) {
653 			icmp_err_ack(q, mp, -error, 0);
654 		} else {
655 			icmp_err_ack(q, mp, 0, error);
656 		}
657 	} else {
658 		mblk_t *mp1;
659 
660 		/*
661 		 * We have to send a connection confirmation to
662 		 * keep TLI happy.
663 		 */
664 		if (icmp->icmp_family == AF_INET) {
665 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
666 			    sizeof (sin_t), NULL, 0);
667 		} else {
668 			ASSERT(icmp->icmp_family == AF_INET6);
669 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
670 			    sizeof (sin6_t), NULL, 0);
671 		}
672 		if (mp1 == NULL) {
673 			rw_exit(&icmp->icmp_rwlock);
674 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
675 			return;
676 		}
677 
678 		/*
679 		 * Send ok_ack for T_CONN_REQ
680 		 */
681 		mp = mi_tpi_ok_ack_alloc(mp);
682 		if (mp == NULL) {
683 			/* Unable to reuse the T_CONN_REQ for the ack. */
684 			freemsg(mp1);
685 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
686 			return;
687 		}
688 		putnext(connp->conn_rq, mp);
689 		putnext(connp->conn_rq, mp1);
690 	}
691 }
692 
693 static int
694 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
695     cred_t *cr)
696 {
697 	icmp_t	*icmp;
698 	sin_t	*sin;
699 	sin6_t	*sin6;
700 	mblk_t  *ire_mp;
701 	int	error;
702 	ipaddr_t	v4dst;
703 	in6_addr_t	v6dst;
704 
705 	icmp = connp->conn_icmp;
706 
707 	if (sa == NULL || !OK_32PTR((char *)sa)) {
708 		return (EINVAL);
709 	}
710 
711 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
712 	if (ire_mp == NULL)
713 		return (ENOMEM);
714 	DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
715 
716 
717 	ASSERT(sa != NULL && len != 0);
718 
719 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
720 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
721 		rw_exit(&icmp->icmp_rwlock);
722 		freeb(ire_mp);
723 		return (-TOUTSTATE);
724 	}
725 
726 	switch (len) {
727 	case sizeof (sin_t):
728 		sin = (sin_t *)sa;
729 
730 		ASSERT(icmp->icmp_family == AF_INET);
731 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
732 
733 		v4dst = sin->sin_addr.s_addr;
734 		/*
735 		 * Interpret a zero destination to mean loopback.
736 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
737 		 * generate the T_CONN_CON.
738 		 */
739 		if (v4dst == INADDR_ANY) {
740 			v4dst = htonl(INADDR_LOOPBACK);
741 		}
742 
743 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
744 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
745 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
746 		    icmp->icmp_ip_snd_options_len;
747 		icmp->icmp_v6dst.sin6_addr = v6dst;
748 		icmp->icmp_v6dst.sin6_family = AF_INET6;
749 		icmp->icmp_v6dst.sin6_flowinfo = 0;
750 		icmp->icmp_v6dst.sin6_port = 0;
751 
752 		/*
753 		 * If the destination address is multicast and
754 		 * an outgoing multicast interface has been set,
755 		 * use the address of that interface as our
756 		 * source address if no source address has been set.
757 		 */
758 		if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY &&
759 		    CLASSD(v4dst) &&
760 		    icmp->icmp_multicast_if_addr != INADDR_ANY) {
761 			IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
762 			    &icmp->icmp_v6src);
763 		}
764 		break;
765 	case sizeof (sin6_t):
766 		sin6 = (sin6_t *)sa;
767 
768 		/* No support for mapped addresses on raw sockets */
769 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
770 			rw_exit(&icmp->icmp_rwlock);
771 			freeb(ire_mp);
772 			return (EADDRNOTAVAIL);
773 		}
774 
775 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
776 		ASSERT(icmp->icmp_family == AF_INET6);
777 
778 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
779 
780 		icmp->icmp_v6dst = *sin6;
781 		icmp->icmp_v6dst.sin6_port = 0;
782 
783 		/*
784 		 * Interpret a zero destination to mean loopback.
785 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
786 		 * generate the T_CONN_CON.
787 		 */
788 		if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) {
789 			icmp->icmp_v6dst.sin6_addr = ipv6_loopback;
790 		}
791 		/*
792 		 * If the destination address is multicast and
793 		 * an outgoing multicast interface has been set,
794 		 * then the ip bind logic will pick the correct source
795 		 * address (i.e. matching the outgoing multicast interface).
796 		 */
797 		break;
798 	}
799 
800 	icmp->icmp_pending_op = T_CONN_REQ;
801 
802 	if (icmp->icmp_state == TS_DATA_XFER) {
803 		/* Already connected - clear out state */
804 		icmp->icmp_v6src = icmp->icmp_bound_v6src;
805 		icmp->icmp_state = TS_IDLE;
806 	}
807 
808 	icmp->icmp_state = TS_DATA_XFER;
809 	rw_exit(&icmp->icmp_rwlock);
810 
811 	if (icmp->icmp_family == AF_INET6) {
812 		error = ip_proto_bind_connected_v6(connp, &ire_mp,
813 		    icmp->icmp_proto, &icmp->icmp_v6src, 0,
814 		    &icmp->icmp_v6dst.sin6_addr,
815 		    NULL, sin6->sin6_port, B_TRUE, B_TRUE, cr);
816 	} else {
817 		error = ip_proto_bind_connected_v4(connp, &ire_mp,
818 		    icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0,
819 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port,
820 		    B_TRUE, B_TRUE, cr);
821 	}
822 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
823 	return (error);
824 }
825 
826 static void
827 icmp_close_free(conn_t *connp)
828 {
829 	icmp_t *icmp = connp->conn_icmp;
830 
831 	/* If there are any options associated with the stream, free them. */
832 	if (icmp->icmp_ip_snd_options != NULL) {
833 		mi_free((char *)icmp->icmp_ip_snd_options);
834 		icmp->icmp_ip_snd_options = NULL;
835 		icmp->icmp_ip_snd_options_len = 0;
836 	}
837 
838 	if (icmp->icmp_filter != NULL) {
839 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
840 		icmp->icmp_filter = NULL;
841 	}
842 
843 	/* Free memory associated with sticky options */
844 	if (icmp->icmp_sticky_hdrs_len != 0) {
845 		kmem_free(icmp->icmp_sticky_hdrs,
846 		    icmp->icmp_sticky_hdrs_len);
847 		icmp->icmp_sticky_hdrs = NULL;
848 		icmp->icmp_sticky_hdrs_len = 0;
849 	}
850 	ip6_pkt_free(&icmp->icmp_sticky_ipp);
851 
852 	/*
853 	 * Clear any fields which the kmem_cache constructor clears.
854 	 * Only icmp_connp needs to be preserved.
855 	 * TBD: We should make this more efficient to avoid clearing
856 	 * everything.
857 	 */
858 	ASSERT(icmp->icmp_connp == connp);
859 	bzero(icmp, sizeof (icmp_t));
860 	icmp->icmp_connp = connp;
861 }
862 
863 static int
864 rawip_do_close(conn_t *connp)
865 {
866 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
867 
868 	ip_quiesce_conn(connp);
869 
870 	if (!IPCL_IS_NONSTR(connp)) {
871 		qprocsoff(connp->conn_rq);
872 	}
873 
874 	ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL &&
875 	    connp->conn_icmp->icmp_fallback_queue_tail == NULL);
876 	icmp_close_free(connp);
877 
878 	/*
879 	 * Now we are truly single threaded on this stream, and can
880 	 * delete the things hanging off the connp, and finally the connp.
881 	 * We removed this connp from the fanout list, it cannot be
882 	 * accessed thru the fanouts, and we already waited for the
883 	 * conn_ref to drop to 0. We are already in close, so
884 	 * there cannot be any other thread from the top. qprocsoff
885 	 * has completed, and service has completed or won't run in
886 	 * future.
887 	 */
888 	ASSERT(connp->conn_ref == 1);
889 
890 	if (!IPCL_IS_NONSTR(connp)) {
891 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
892 	} else {
893 		ip_free_helper_stream(connp);
894 	}
895 
896 	connp->conn_ref--;
897 	ipcl_conn_destroy(connp);
898 
899 	return (0);
900 }
901 
902 static int
903 icmp_close(queue_t *q, int flags)
904 {
905 	conn_t  *connp;
906 
907 	if (flags & SO_FALLBACK) {
908 		/*
909 		 * stream is being closed while in fallback
910 		 * simply free the resources that were allocated
911 		 */
912 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
913 		qprocsoff(q);
914 		goto done;
915 	}
916 
917 	connp = Q_TO_CONN(q);
918 	(void) rawip_do_close(connp);
919 done:
920 	q->q_ptr = WR(q)->q_ptr = NULL;
921 	return (0);
922 }
923 
924 /*
925  * This routine handles each T_DISCON_REQ message passed to icmp
926  * as an indicating that ICMP is no longer connected. This results
927  * in sending a T_BIND_REQ to IP to restore the binding to just
928  * the local address.
929  *
930  * The disconnect completes in rawip_post_ip_bind_connect.
931  */
932 static int
933 icmp_do_disconnect(conn_t *connp)
934 {
935 	icmp_t	*icmp;
936 	mblk_t	*ire_mp;
937 	int error;
938 
939 	icmp = connp->conn_icmp;
940 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
941 	if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
942 		rw_exit(&icmp->icmp_rwlock);
943 		return (-TOUTSTATE);
944 	}
945 	icmp->icmp_pending_op = T_DISCON_REQ;
946 	icmp->icmp_v6src = icmp->icmp_bound_v6src;
947 	icmp->icmp_state = TS_IDLE;
948 
949 
950 	if (icmp->icmp_family == AF_INET6) {
951 		/* Rebuild the header template */
952 		error = icmp_build_hdrs(icmp);
953 		if (error != 0) {
954 			icmp->icmp_pending_op = -1;
955 			rw_exit(&icmp->icmp_rwlock);
956 			return (error);
957 		}
958 	}
959 
960 	rw_exit(&icmp->icmp_rwlock);
961 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
962 	if (ire_mp == NULL) {
963 		return (ENOMEM);
964 	}
965 
966 	if (icmp->icmp_family == AF_INET6) {
967 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
968 		    &icmp->icmp_bound_v6src, 0, B_TRUE);
969 	} else {
970 
971 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
972 		    V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE);
973 	}
974 
975 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
976 
977 	return (error);
978 }
979 
980 static void
981 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
982 {
983 	conn_t	*connp = Q_TO_CONN(q);
984 	int	error;
985 
986 	/*
987 	 * Allocate the largest primitive we need to send back
988 	 * T_error_ack is > than T_ok_ack
989 	 */
990 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
991 	if (mp == NULL) {
992 		/* Unable to reuse the T_DISCON_REQ for the ack. */
993 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
994 		return;
995 	}
996 
997 	error = icmp_do_disconnect(connp);
998 
999 	if (error != 0) {
1000 		if (error > 0) {
1001 			icmp_err_ack(q, mp, 0, error);
1002 		} else {
1003 			icmp_err_ack(q, mp, -error, 0);
1004 		}
1005 	} else {
1006 		mp = mi_tpi_ok_ack_alloc(mp);
1007 		ASSERT(mp != NULL);
1008 		qreply(q, mp);
1009 	}
1010 
1011 }
1012 
1013 static int
1014 icmp_disconnect(conn_t *connp)
1015 {
1016 	int	error;
1017 	icmp_t	*icmp = connp->conn_icmp;
1018 
1019 	icmp->icmp_dgram_errind = B_FALSE;
1020 
1021 	error = icmp_do_disconnect(connp);
1022 
1023 	if (error < 0)
1024 		error = proto_tlitosyserr(-error);
1025 	return (error);
1026 }
1027 
1028 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1029 static void
1030 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1031 {
1032 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1033 		qreply(q, mp);
1034 }
1035 
1036 /* Shorthand to generate and send TPI error acks to our client */
1037 static void
1038 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1039     t_scalar_t t_error, int sys_error)
1040 {
1041 	struct T_error_ack	*teackp;
1042 
1043 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1044 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1045 		teackp = (struct T_error_ack *)mp->b_rptr;
1046 		teackp->ERROR_prim = primitive;
1047 		teackp->TLI_error = t_error;
1048 		teackp->UNIX_error = sys_error;
1049 		qreply(q, mp);
1050 	}
1051 }
1052 
1053 /*
1054  * icmp_icmp_error is called by icmp_input to process ICMP
1055  * messages passed up by IP.
1056  * Generates the appropriate permanent (non-transient) errors.
1057  * Assumes that IP has pulled up everything up to and including
1058  * the ICMP header.
1059  */
1060 static void
1061 icmp_icmp_error(conn_t *connp, mblk_t *mp)
1062 {
1063 	icmph_t *icmph;
1064 	ipha_t	*ipha;
1065 	int	iph_hdr_length;
1066 	sin_t	sin;
1067 	mblk_t	*mp1;
1068 	int	error = 0;
1069 	icmp_t	*icmp = connp->conn_icmp;
1070 
1071 	ipha = (ipha_t *)mp->b_rptr;
1072 
1073 	ASSERT(OK_32PTR(mp->b_rptr));
1074 
1075 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1076 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1077 		icmp_icmp_error_ipv6(connp, mp);
1078 		return;
1079 	}
1080 
1081 	/*
1082 	 * icmp does not support v4 mapped addresses
1083 	 * so we can never be here for a V6 socket
1084 	 * i.e. icmp_family == AF_INET6
1085 	 */
1086 	ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) &&
1087 	    (icmp->icmp_family == AF_INET));
1088 
1089 	ASSERT(icmp->icmp_family == AF_INET);
1090 
1091 	/* Skip past the outer IP and ICMP headers */
1092 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1093 	icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]);
1094 	ipha = (ipha_t *)&icmph[1];
1095 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1096 
1097 	switch (icmph->icmph_type) {
1098 	case ICMP_DEST_UNREACHABLE:
1099 		switch (icmph->icmph_code) {
1100 		case ICMP_FRAGMENTATION_NEEDED:
1101 			/*
1102 			 * IP has already adjusted the path MTU.
1103 			 */
1104 			break;
1105 		case ICMP_PORT_UNREACHABLE:
1106 		case ICMP_PROTOCOL_UNREACHABLE:
1107 			error = ECONNREFUSED;
1108 			break;
1109 		default:
1110 			/* Transient errors */
1111 			break;
1112 		}
1113 		break;
1114 	default:
1115 		/* Transient errors */
1116 		break;
1117 	}
1118 	if (error == 0) {
1119 		freemsg(mp);
1120 		return;
1121 	}
1122 
1123 	/*
1124 	 * Deliver T_UDERROR_IND when the application has asked for it.
1125 	 * The socket layer enables this automatically when connected.
1126 	 */
1127 	if (!icmp->icmp_dgram_errind) {
1128 		freemsg(mp);
1129 		return;
1130 	}
1131 
1132 	sin = sin_null;
1133 	sin.sin_family = AF_INET;
1134 	sin.sin_addr.s_addr = ipha->ipha_dst;
1135 
1136 	if (IPCL_IS_NONSTR(connp)) {
1137 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1138 		if (icmp->icmp_state == TS_DATA_XFER) {
1139 			if (sin.sin_addr.s_addr ==
1140 			    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) {
1141 				rw_exit(&icmp->icmp_rwlock);
1142 				(*connp->conn_upcalls->su_set_error)
1143 				    (connp->conn_upper_handle, error);
1144 				goto done;
1145 			}
1146 		} else {
1147 			icmp->icmp_delayed_error = error;
1148 			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1149 		}
1150 		rw_exit(&icmp->icmp_rwlock);
1151 	} else {
1152 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
1153 		    0, error);
1154 		if (mp1 != NULL)
1155 			putnext(connp->conn_rq, mp1);
1156 	}
1157 done:
1158 	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
1159 	freemsg(mp);
1160 }
1161 
1162 /*
1163  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6
1164  * for IPv6 packets.
1165  * Send permanent (non-transient) errors upstream.
1166  * Assumes that IP has pulled up all the extension headers as well
1167  * as the ICMPv6 header.
1168  */
1169 static void
1170 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
1171 {
1172 	icmp6_t		*icmp6;
1173 	ip6_t		*ip6h, *outer_ip6h;
1174 	uint16_t	iph_hdr_length;
1175 	uint8_t		*nexthdrp;
1176 	sin6_t		sin6;
1177 	mblk_t		*mp1;
1178 	int		error = 0;
1179 	icmp_t		*icmp = connp->conn_icmp;
1180 
1181 	outer_ip6h = (ip6_t *)mp->b_rptr;
1182 	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1183 		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1184 	else
1185 		iph_hdr_length = IPV6_HDR_LEN;
1186 
1187 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1188 	ip6h = (ip6_t *)&icmp6[1];
1189 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1190 		freemsg(mp);
1191 		return;
1192 	}
1193 
1194 	switch (icmp6->icmp6_type) {
1195 	case ICMP6_DST_UNREACH:
1196 		switch (icmp6->icmp6_code) {
1197 		case ICMP6_DST_UNREACH_NOPORT:
1198 			error = ECONNREFUSED;
1199 			break;
1200 		case ICMP6_DST_UNREACH_ADMIN:
1201 		case ICMP6_DST_UNREACH_NOROUTE:
1202 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1203 		case ICMP6_DST_UNREACH_ADDR:
1204 			/* Transient errors */
1205 			break;
1206 		default:
1207 			break;
1208 		}
1209 		break;
1210 	case ICMP6_PACKET_TOO_BIG: {
1211 		struct T_unitdata_ind	*tudi;
1212 		struct T_opthdr		*toh;
1213 		size_t			udi_size;
1214 		mblk_t			*newmp;
1215 		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1216 		    sizeof (struct ip6_mtuinfo);
1217 		sin6_t			*sin6;
1218 		struct ip6_mtuinfo	*mtuinfo;
1219 
1220 		/*
1221 		 * If the application has requested to receive path mtu
1222 		 * information, send up an empty message containing an
1223 		 * IPV6_PATHMTU ancillary data item.
1224 		 */
1225 		if (!icmp->icmp_ipv6_recvpathmtu)
1226 			break;
1227 
1228 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1229 		    opt_length;
1230 		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1231 			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1232 			break;
1233 		}
1234 
1235 		/*
1236 		 * newmp->b_cont is left to NULL on purpose.  This is an
1237 		 * empty message containing only ancillary data.
1238 		 */
1239 		newmp->b_datap->db_type = M_PROTO;
1240 		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1241 		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1242 		tudi->PRIM_type = T_UNITDATA_IND;
1243 		tudi->SRC_length = sizeof (sin6_t);
1244 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1245 		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1246 		tudi->OPT_length = opt_length;
1247 
1248 		sin6 = (sin6_t *)&tudi[1];
1249 		bzero(sin6, sizeof (sin6_t));
1250 		sin6->sin6_family = AF_INET6;
1251 		sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr;
1252 
1253 		toh = (struct T_opthdr *)&sin6[1];
1254 		toh->level = IPPROTO_IPV6;
1255 		toh->name = IPV6_PATHMTU;
1256 		toh->len = opt_length;
1257 		toh->status = 0;
1258 
1259 		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1260 		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1261 		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1262 		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1263 		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1264 		/*
1265 		 * We've consumed everything we need from the original
1266 		 * message.  Free it, then send our empty message.
1267 		 */
1268 		freemsg(mp);
1269 		icmp_ulp_recv(connp, newmp);
1270 
1271 		return;
1272 	}
1273 	case ICMP6_TIME_EXCEEDED:
1274 		/* Transient errors */
1275 		break;
1276 	case ICMP6_PARAM_PROB:
1277 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1278 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1279 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1280 		    (uchar_t *)nexthdrp) {
1281 			error = ECONNREFUSED;
1282 			break;
1283 		}
1284 		break;
1285 	}
1286 	if (error == 0) {
1287 		freemsg(mp);
1288 		return;
1289 	}
1290 
1291 	/*
1292 	 * Deliver T_UDERROR_IND when the application has asked for it.
1293 	 * The socket layer enables this automatically when connected.
1294 	 */
1295 	if (!icmp->icmp_dgram_errind) {
1296 		freemsg(mp);
1297 		return;
1298 	}
1299 
1300 	sin6 = sin6_null;
1301 	sin6.sin6_family = AF_INET6;
1302 	sin6.sin6_addr = ip6h->ip6_dst;
1303 	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1304 
1305 	if (IPCL_IS_NONSTR(connp)) {
1306 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1307 		if (icmp->icmp_state == TS_DATA_XFER) {
1308 			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1309 			    &icmp->icmp_v6dst.sin6_addr)) {
1310 				rw_exit(&icmp->icmp_rwlock);
1311 				(*connp->conn_upcalls->su_set_error)
1312 				    (connp->conn_upper_handle, error);
1313 				goto done;
1314 			}
1315 		} else {
1316 			icmp->icmp_delayed_error = error;
1317 			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1318 		}
1319 		rw_exit(&icmp->icmp_rwlock);
1320 	} else {
1321 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1322 		    NULL, 0, error);
1323 		if (mp1 != NULL)
1324 			putnext(connp->conn_rq, mp1);
1325 	}
1326 done:
1327 	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
1328 	freemsg(mp);
1329 }
1330 
1331 /*
1332  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1333  * The local address is filled in if endpoint is bound. The remote address
1334  * is filled in if remote address has been precified ("connected endpoint")
1335  * (The concept of connected CLTS sockets is alien to published TPI
1336  *  but we support it anyway).
1337  */
1338 static void
1339 icmp_addr_req(queue_t *q, mblk_t *mp)
1340 {
1341 	icmp_t	*icmp = Q_TO_ICMP(q);
1342 	mblk_t	*ackmp;
1343 	struct T_addr_ack *taa;
1344 
1345 	/* Make it large enough for worst case */
1346 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1347 	    2 * sizeof (sin6_t), 1);
1348 	if (ackmp == NULL) {
1349 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1350 		return;
1351 	}
1352 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1353 
1354 	bzero(taa, sizeof (struct T_addr_ack));
1355 	ackmp->b_wptr = (uchar_t *)&taa[1];
1356 
1357 	taa->PRIM_type = T_ADDR_ACK;
1358 	ackmp->b_datap->db_type = M_PCPROTO;
1359 	rw_enter(&icmp->icmp_rwlock, RW_READER);
1360 	/*
1361 	 * Note: Following code assumes 32 bit alignment of basic
1362 	 * data structures like sin_t and struct T_addr_ack.
1363 	 */
1364 	if (icmp->icmp_state != TS_UNBND) {
1365 		/*
1366 		 * Fill in local address
1367 		 */
1368 		taa->LOCADDR_offset = sizeof (*taa);
1369 		if (icmp->icmp_family == AF_INET) {
1370 			sin_t	*sin;
1371 
1372 			taa->LOCADDR_length = sizeof (sin_t);
1373 			sin = (sin_t *)&taa[1];
1374 			/* Fill zeroes and then intialize non-zero fields */
1375 			*sin = sin_null;
1376 			sin->sin_family = AF_INET;
1377 			if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
1378 			    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1379 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src,
1380 				    sin->sin_addr.s_addr);
1381 			} else {
1382 				/*
1383 				 * INADDR_ANY
1384 				 * icmp_v6src is not set, we might be bound to
1385 				 * broadcast/multicast. Use icmp_bound_v6src as
1386 				 * local address instead (that could
1387 				 * also still be INADDR_ANY)
1388 				 */
1389 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src,
1390 				    sin->sin_addr.s_addr);
1391 			}
1392 			ackmp->b_wptr = (uchar_t *)&sin[1];
1393 		} else {
1394 			sin6_t	*sin6;
1395 
1396 			ASSERT(icmp->icmp_family == AF_INET6);
1397 			taa->LOCADDR_length = sizeof (sin6_t);
1398 			sin6 = (sin6_t *)&taa[1];
1399 			/* Fill zeroes and then intialize non-zero fields */
1400 			*sin6 = sin6_null;
1401 			sin6->sin6_family = AF_INET6;
1402 			if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1403 				sin6->sin6_addr = icmp->icmp_v6src;
1404 			} else {
1405 				/*
1406 				 * UNSPECIFIED
1407 				 * icmp_v6src is not set, we might be bound to
1408 				 * broadcast/multicast. Use icmp_bound_v6src as
1409 				 * local address instead (that could
1410 				 * also still be UNSPECIFIED)
1411 				 */
1412 				sin6->sin6_addr = icmp->icmp_bound_v6src;
1413 			}
1414 			ackmp->b_wptr = (uchar_t *)&sin6[1];
1415 		}
1416 	}
1417 	rw_exit(&icmp->icmp_rwlock);
1418 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1419 	qreply(q, ackmp);
1420 }
1421 
1422 static void
1423 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1424 {
1425 	*tap = icmp_g_t_info_ack;
1426 
1427 	if (icmp->icmp_family == AF_INET6)
1428 		tap->ADDR_size = sizeof (sin6_t);
1429 	else
1430 		tap->ADDR_size = sizeof (sin_t);
1431 	tap->CURRENT_state = icmp->icmp_state;
1432 	tap->OPT_size = icmp_max_optsize;
1433 }
1434 
1435 static void
1436 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1437     t_uscalar_t cap_bits1)
1438 {
1439 	tcap->CAP_bits1 = 0;
1440 
1441 	if (cap_bits1 & TC1_INFO) {
1442 		icmp_copy_info(&tcap->INFO_ack, icmp);
1443 		tcap->CAP_bits1 |= TC1_INFO;
1444 	}
1445 }
1446 
1447 /*
1448  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1449  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1450  * icmp_g_t_info_ack.  The current state of the stream is copied from
1451  * icmp_state.
1452  */
1453 static void
1454 icmp_capability_req(queue_t *q, mblk_t *mp)
1455 {
1456 	icmp_t			*icmp = Q_TO_ICMP(q);
1457 	t_uscalar_t		cap_bits1;
1458 	struct T_capability_ack	*tcap;
1459 
1460 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1461 
1462 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1463 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1464 	if (!mp)
1465 		return;
1466 
1467 	tcap = (struct T_capability_ack *)mp->b_rptr;
1468 
1469 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1470 
1471 	qreply(q, mp);
1472 }
1473 
1474 /*
1475  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1476  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1477  * The current state of the stream is copied from icmp_state.
1478  */
1479 static void
1480 icmp_info_req(queue_t *q, mblk_t *mp)
1481 {
1482 	icmp_t	*icmp = Q_TO_ICMP(q);
1483 
1484 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1485 	    T_INFO_ACK);
1486 	if (!mp)
1487 		return;
1488 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1489 	qreply(q, mp);
1490 }
1491 
1492 /* For /dev/icmp aka AF_INET open */
1493 static int
1494 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1495     int family)
1496 {
1497 	conn_t *connp;
1498 	dev_t	conn_dev;
1499 	icmp_stack_t *is;
1500 	int	error;
1501 
1502 	conn_dev = NULL;
1503 
1504 	/* If the stream is already open, return immediately. */
1505 	if (q->q_ptr != NULL)
1506 		return (0);
1507 
1508 	if (sflag == MODOPEN)
1509 		return (EINVAL);
1510 
1511 	/*
1512 	 * Since ICMP is not used so heavily, allocating from the small
1513 	 * arena should be sufficient.
1514 	 */
1515 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1516 		return (EBUSY);
1517 	}
1518 
1519 	if (flag & SO_FALLBACK) {
1520 		/*
1521 		 * Non streams socket needs a stream to fallback to
1522 		 */
1523 		RD(q)->q_ptr = (void *)conn_dev;
1524 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1525 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1526 		qprocson(q);
1527 		return (0);
1528 	}
1529 
1530 	connp = icmp_open(family, credp, &error, KM_SLEEP);
1531 	if (connp == NULL) {
1532 		ASSERT(error != NULL);
1533 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1534 		return (error);
1535 	}
1536 
1537 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1538 	connp->conn_dev = conn_dev;
1539 	connp->conn_minor_arena = ip_minor_arena_sa;
1540 
1541 	is = connp->conn_icmp->icmp_is;
1542 
1543 	/*
1544 	 * Initialize the icmp_t structure for this stream.
1545 	 */
1546 	q->q_ptr = connp;
1547 	WR(q)->q_ptr = connp;
1548 	connp->conn_rq = q;
1549 	connp->conn_wq = WR(q);
1550 
1551 	if (connp->conn_icmp->icmp_family == AF_INET6) {
1552 		/* Build initial header template for transmit */
1553 		rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
1554 		if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) {
1555 			rw_exit(&connp->conn_icmp->icmp_rwlock);
1556 			inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1557 			ipcl_conn_destroy(connp);
1558 			return (error);
1559 		}
1560 		rw_exit(&connp->conn_icmp->icmp_rwlock);
1561 	}
1562 
1563 
1564 	q->q_hiwat = is->is_recv_hiwat;
1565 	WR(q)->q_hiwat = is->is_xmit_hiwat;
1566 	WR(q)->q_lowat = is->is_xmit_lowat;
1567 
1568 	qprocson(q);
1569 
1570 	/* Set the Stream head write offset. */
1571 	(void) proto_set_tx_wroff(q, connp,
1572 	    connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra);
1573 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat);
1574 
1575 	mutex_enter(&connp->conn_lock);
1576 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1577 	mutex_exit(&connp->conn_lock);
1578 
1579 	return (0);
1580 }
1581 
1582 /* For /dev/icmp4 aka AF_INET open */
1583 static int
1584 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1585 {
1586 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1587 }
1588 
1589 /* For /dev/icmp6 aka AF_INET6 open */
1590 static int
1591 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1592 {
1593 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1594 }
1595 
1596 /*
1597  * This is the open routine for icmp.  It allocates a icmp_t structure for
1598  * the stream and, on the first open of the module, creates an ND table.
1599  */
1600 /* ARGSUSED */
1601 static conn_t *
1602 icmp_open(int family, cred_t *credp, int *err, int flags)
1603 {
1604 	icmp_t	*icmp;
1605 	conn_t *connp;
1606 	zoneid_t zoneid;
1607 	netstack_t *ns;
1608 	icmp_stack_t *is;
1609 	boolean_t isv6 = B_FALSE;
1610 
1611 	*err = secpolicy_net_icmpaccess(credp);
1612 	if (*err != 0)
1613 		return (NULL);
1614 
1615 	if (family == AF_INET6)
1616 		isv6 = B_TRUE;
1617 	ns = netstack_find_by_cred(credp);
1618 	ASSERT(ns != NULL);
1619 	is = ns->netstack_icmp;
1620 	ASSERT(is != NULL);
1621 
1622 	/*
1623 	 * For exclusive stacks we set the zoneid to zero
1624 	 * to make ICMP operate as if in the global zone.
1625 	 */
1626 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1627 		zoneid = GLOBAL_ZONEID;
1628 	else
1629 		zoneid = crgetzoneid(credp);
1630 
1631 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1632 
1633 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1634 	icmp = connp->conn_icmp;
1635 	icmp->icmp_v6dst = sin6_null;
1636 
1637 	/*
1638 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1639 	 * done by netstack_find_by_cred()
1640 	 */
1641 	netstack_rele(ns);
1642 
1643 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1644 	ASSERT(connp->conn_ulp == IPPROTO_ICMP);
1645 	ASSERT(connp->conn_icmp == icmp);
1646 	ASSERT(icmp->icmp_connp == connp);
1647 
1648 	/* Set the initial state of the stream and the privilege status. */
1649 	icmp->icmp_state = TS_UNBND;
1650 	if (isv6) {
1651 		icmp->icmp_ipversion = IPV6_VERSION;
1652 		icmp->icmp_family = AF_INET6;
1653 		connp->conn_ulp = IPPROTO_ICMPV6;
1654 		/* May be changed by a SO_PROTOTYPE socket option. */
1655 		icmp->icmp_proto = IPPROTO_ICMPV6;
1656 		icmp->icmp_checksum_off = 2;	/* Offset for icmp6_cksum */
1657 		icmp->icmp_max_hdr_len = IPV6_HDR_LEN;
1658 		icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit;
1659 		connp->conn_af_isv6 = B_TRUE;
1660 		connp->conn_flags |= IPCL_ISV6;
1661 	} else {
1662 		icmp->icmp_ipversion = IPV4_VERSION;
1663 		icmp->icmp_family = AF_INET;
1664 		/* May be changed by a SO_PROTOTYPE socket option. */
1665 		icmp->icmp_proto = IPPROTO_ICMP;
1666 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH;
1667 		icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl;
1668 		connp->conn_af_isv6 = B_FALSE;
1669 		connp->conn_flags &= ~IPCL_ISV6;
1670 	}
1671 	icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1672 	icmp->icmp_pending_op = -1;
1673 	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1674 	connp->conn_zoneid = zoneid;
1675 
1676 	/*
1677 	 * If the caller has the process-wide flag set, then default to MAC
1678 	 * exempt mode.  This allows read-down to unlabeled hosts.
1679 	 */
1680 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1681 		connp->conn_mac_exempt = B_TRUE;
1682 
1683 	connp->conn_ulp_labeled = is_system_labeled();
1684 
1685 	icmp->icmp_is = is;
1686 
1687 	connp->conn_recv = icmp_input;
1688 	crhold(credp);
1689 	connp->conn_cred = credp;
1690 
1691 	rw_exit(&icmp->icmp_rwlock);
1692 
1693 	connp->conn_flow_cntrld = B_FALSE;
1694 	return (connp);
1695 }
1696 
1697 /*
1698  * Which ICMP options OK to set through T_UNITDATA_REQ...
1699  */
1700 /* ARGSUSED */
1701 static boolean_t
1702 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1703 {
1704 	return (B_TRUE);
1705 }
1706 
1707 /*
1708  * This routine gets default values of certain options whose default
1709  * values are maintained by protcol specific code
1710  */
1711 /* ARGSUSED */
1712 int
1713 icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
1714 {
1715 	icmp_t *icmp = Q_TO_ICMP(q);
1716 	icmp_stack_t *is = icmp->icmp_is;
1717 	int *i1 = (int *)ptr;
1718 
1719 	switch (level) {
1720 	case IPPROTO_IP:
1721 		switch (name) {
1722 		case IP_MULTICAST_TTL:
1723 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1724 			return (sizeof (uchar_t));
1725 		case IP_MULTICAST_LOOP:
1726 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1727 			return (sizeof (uchar_t));
1728 		}
1729 		break;
1730 	case IPPROTO_IPV6:
1731 		switch (name) {
1732 		case IPV6_MULTICAST_HOPS:
1733 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1734 			return (sizeof (int));
1735 		case IPV6_MULTICAST_LOOP:
1736 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1737 			return (sizeof (int));
1738 		case IPV6_UNICAST_HOPS:
1739 			*i1 = is->is_ipv6_hoplimit;
1740 			return (sizeof (int));
1741 		}
1742 		break;
1743 	case IPPROTO_ICMPV6:
1744 		switch (name) {
1745 		case ICMP6_FILTER:
1746 			/* Make it look like "pass all" */
1747 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1748 			return (sizeof (icmp6_filter_t));
1749 		}
1750 		break;
1751 	}
1752 	return (-1);
1753 }
1754 
1755 /*
1756  * This routine retrieves the current status of socket options.
1757  * It returns the size of the option retrieved.
1758  */
1759 int
1760 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1761 {
1762 	icmp_t		*icmp = connp->conn_icmp;
1763 	icmp_stack_t	*is = icmp->icmp_is;
1764 	int		*i1 = (int *)ptr;
1765 	ip6_pkt_t	*ipp = &icmp->icmp_sticky_ipp;
1766 	int		ret = 0;
1767 
1768 	ASSERT(RW_READ_HELD(&icmp->icmp_rwlock));
1769 	switch (level) {
1770 	case SOL_SOCKET:
1771 		switch (name) {
1772 		case SO_DEBUG:
1773 			*i1 = icmp->icmp_debug;
1774 			break;
1775 		case SO_TYPE:
1776 			*i1 = SOCK_RAW;
1777 			break;
1778 		case SO_PROTOTYPE:
1779 			*i1 = icmp->icmp_proto;
1780 			break;
1781 		case SO_REUSEADDR:
1782 			*i1 = icmp->icmp_reuseaddr;
1783 			break;
1784 
1785 		/*
1786 		 * The following three items are available here,
1787 		 * but are only meaningful to IP.
1788 		 */
1789 		case SO_DONTROUTE:
1790 			*i1 = icmp->icmp_dontroute;
1791 			break;
1792 		case SO_USELOOPBACK:
1793 			*i1 = icmp->icmp_useloopback;
1794 			break;
1795 		case SO_BROADCAST:
1796 			*i1 = icmp->icmp_broadcast;
1797 			break;
1798 
1799 		case SO_SNDBUF:
1800 			ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX);
1801 			*i1 = icmp->icmp_xmit_hiwat;
1802 			break;
1803 		case SO_RCVBUF:
1804 			ASSERT(icmp->icmp_recv_hiwat <= INT_MAX);
1805 			*i1 = icmp->icmp_recv_hiwat;
1806 			break;
1807 		case SO_DGRAM_ERRIND:
1808 			*i1 = icmp->icmp_dgram_errind;
1809 			break;
1810 		case SO_TIMESTAMP:
1811 			*i1 = icmp->icmp_timestamp;
1812 			break;
1813 		case SO_MAC_EXEMPT:
1814 			*i1 = connp->conn_mac_exempt;
1815 			break;
1816 		case SO_DOMAIN:
1817 			*i1 = icmp->icmp_family;
1818 			break;
1819 
1820 		/*
1821 		 * Following four not meaningful for icmp
1822 		 * Action is same as "default" to which we fallthrough
1823 		 * so we keep them in comments.
1824 		 * case SO_LINGER:
1825 		 * case SO_KEEPALIVE:
1826 		 * case SO_OOBINLINE:
1827 		 * case SO_ALLZONES:
1828 		 */
1829 		default:
1830 			ret = -1;
1831 			goto done;
1832 		}
1833 		break;
1834 	case IPPROTO_IP:
1835 		/*
1836 		 * Only allow IPv4 option processing on IPv4 sockets.
1837 		 */
1838 		if (icmp->icmp_family != AF_INET) {
1839 			ret = -1;
1840 			goto done;
1841 		}
1842 
1843 		switch (name) {
1844 		case IP_OPTIONS:
1845 		case T_IP_OPTIONS:
1846 			/* Options are passed up with each packet */
1847 			ret = 0;
1848 			goto done;
1849 		case IP_HDRINCL:
1850 			*i1 = (int)icmp->icmp_hdrincl;
1851 			break;
1852 		case IP_TOS:
1853 		case T_IP_TOS:
1854 			*i1 = (int)icmp->icmp_type_of_service;
1855 			break;
1856 		case IP_TTL:
1857 			*i1 = (int)icmp->icmp_ttl;
1858 			break;
1859 		case IP_MULTICAST_IF:
1860 			/* 0 address if not set */
1861 			*(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr;
1862 			ret = sizeof (ipaddr_t);
1863 			goto done;
1864 		case IP_MULTICAST_TTL:
1865 			*(uchar_t *)ptr = icmp->icmp_multicast_ttl;
1866 			ret = sizeof (uchar_t);
1867 			goto done;
1868 		case IP_MULTICAST_LOOP:
1869 			*ptr = connp->conn_multicast_loop;
1870 			ret = sizeof (uint8_t);
1871 			goto done;
1872 		case IP_BOUND_IF:
1873 			/* Zero if not set */
1874 			*i1 = icmp->icmp_bound_if;
1875 			break;	/* goto sizeof (int) option return */
1876 		case IP_UNSPEC_SRC:
1877 			*ptr = icmp->icmp_unspec_source;
1878 			break;	/* goto sizeof (int) option return */
1879 		case IP_RECVIF:
1880 			*ptr = icmp->icmp_recvif;
1881 			break;	/* goto sizeof (int) option return */
1882 		case IP_BROADCAST_TTL:
1883 			*(uchar_t *)ptr = connp->conn_broadcast_ttl;
1884 			return (sizeof (uchar_t));
1885 		case IP_RECVPKTINFO:
1886 			/*
1887 			 * This also handles IP_PKTINFO.
1888 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
1889 			 * Differentiation is based on the size of the argument
1890 			 * passed in.
1891 			 * This option is handled in IP which will return an
1892 			 * error for IP_PKTINFO as it's not supported as a
1893 			 * sticky option.
1894 			 */
1895 			ret = -EINVAL;
1896 			goto done;
1897 		/*
1898 		 * Cannot "get" the value of following options
1899 		 * at this level. Action is same as "default" to
1900 		 * which we fallthrough so we keep them in comments.
1901 		 *
1902 		 * case IP_ADD_MEMBERSHIP:
1903 		 * case IP_DROP_MEMBERSHIP:
1904 		 * case IP_BLOCK_SOURCE:
1905 		 * case IP_UNBLOCK_SOURCE:
1906 		 * case IP_ADD_SOURCE_MEMBERSHIP:
1907 		 * case IP_DROP_SOURCE_MEMBERSHIP:
1908 		 * case MCAST_JOIN_GROUP:
1909 		 * case MCAST_LEAVE_GROUP:
1910 		 * case MCAST_BLOCK_SOURCE:
1911 		 * case MCAST_UNBLOCK_SOURCE:
1912 		 * case MCAST_JOIN_SOURCE_GROUP:
1913 		 * case MCAST_LEAVE_SOURCE_GROUP:
1914 		 * case MRT_INIT:
1915 		 * case MRT_DONE:
1916 		 * case MRT_ADD_VIF:
1917 		 * case MRT_DEL_VIF:
1918 		 * case MRT_ADD_MFC:
1919 		 * case MRT_DEL_MFC:
1920 		 * case MRT_VERSION:
1921 		 * case MRT_ASSERT:
1922 		 * case IP_SEC_OPT:
1923 		 * case IP_NEXTHOP:
1924 		 */
1925 		default:
1926 			ret = -1;
1927 			goto done;
1928 		}
1929 		break;
1930 	case IPPROTO_IPV6:
1931 		/*
1932 		 * Only allow IPv6 option processing on native IPv6 sockets.
1933 		 */
1934 		if (icmp->icmp_family != AF_INET6) {
1935 			ret = -1;
1936 			goto done;
1937 		}
1938 		switch (name) {
1939 		case IPV6_UNICAST_HOPS:
1940 			*i1 = (unsigned int)icmp->icmp_ttl;
1941 			break;
1942 		case IPV6_MULTICAST_IF:
1943 			/* 0 index if not set */
1944 			*i1 = icmp->icmp_multicast_if_index;
1945 			break;
1946 		case IPV6_MULTICAST_HOPS:
1947 			*i1 = icmp->icmp_multicast_ttl;
1948 			break;
1949 		case IPV6_MULTICAST_LOOP:
1950 			*i1 = connp->conn_multicast_loop;
1951 			break;
1952 		case IPV6_BOUND_IF:
1953 			/* Zero if not set */
1954 			*i1 = icmp->icmp_bound_if;
1955 			break;
1956 		case IPV6_UNSPEC_SRC:
1957 			*i1 = icmp->icmp_unspec_source;
1958 			break;
1959 		case IPV6_CHECKSUM:
1960 			/*
1961 			 * Return offset or -1 if no checksum offset.
1962 			 * Does not apply to IPPROTO_ICMPV6
1963 			 */
1964 			if (icmp->icmp_proto == IPPROTO_ICMPV6) {
1965 				ret = -1;
1966 				goto done;
1967 			}
1968 
1969 			if (icmp->icmp_raw_checksum) {
1970 				*i1 = icmp->icmp_checksum_off;
1971 			} else {
1972 				*i1 = -1;
1973 			}
1974 			break;
1975 		case IPV6_JOIN_GROUP:
1976 		case IPV6_LEAVE_GROUP:
1977 		case MCAST_JOIN_GROUP:
1978 		case MCAST_LEAVE_GROUP:
1979 		case MCAST_BLOCK_SOURCE:
1980 		case MCAST_UNBLOCK_SOURCE:
1981 		case MCAST_JOIN_SOURCE_GROUP:
1982 		case MCAST_LEAVE_SOURCE_GROUP:
1983 			/* cannot "get" the value for these */
1984 			ret = -1;
1985 			goto done;
1986 		case IPV6_RECVPKTINFO:
1987 			*i1 = icmp->icmp_ip_recvpktinfo;
1988 			break;
1989 		case IPV6_RECVTCLASS:
1990 			*i1 = icmp->icmp_ipv6_recvtclass;
1991 			break;
1992 		case IPV6_RECVPATHMTU:
1993 			*i1 = icmp->icmp_ipv6_recvpathmtu;
1994 			break;
1995 		case IPV6_V6ONLY:
1996 			*i1 = 1;
1997 			break;
1998 		case IPV6_RECVHOPLIMIT:
1999 			*i1 = icmp->icmp_ipv6_recvhoplimit;
2000 			break;
2001 		case IPV6_RECVHOPOPTS:
2002 			*i1 = icmp->icmp_ipv6_recvhopopts;
2003 			break;
2004 		case IPV6_RECVDSTOPTS:
2005 			*i1 = icmp->icmp_ipv6_recvdstopts;
2006 			break;
2007 		case _OLD_IPV6_RECVDSTOPTS:
2008 			*i1 = icmp->icmp_old_ipv6_recvdstopts;
2009 			break;
2010 		case IPV6_RECVRTHDRDSTOPTS:
2011 			*i1 = icmp->icmp_ipv6_recvrtdstopts;
2012 			break;
2013 		case IPV6_RECVRTHDR:
2014 			*i1 = icmp->icmp_ipv6_recvrthdr;
2015 			break;
2016 		case IPV6_PKTINFO: {
2017 			/* XXX assumes that caller has room for max size! */
2018 			struct in6_pktinfo *pkti;
2019 
2020 			pkti = (struct in6_pktinfo *)ptr;
2021 			if (ipp->ipp_fields & IPPF_IFINDEX)
2022 				pkti->ipi6_ifindex = ipp->ipp_ifindex;
2023 			else
2024 				pkti->ipi6_ifindex = 0;
2025 			if (ipp->ipp_fields & IPPF_ADDR)
2026 				pkti->ipi6_addr = ipp->ipp_addr;
2027 			else
2028 				pkti->ipi6_addr = ipv6_all_zeros;
2029 			ret = sizeof (struct in6_pktinfo);
2030 			goto done;
2031 		}
2032 		case IPV6_NEXTHOP: {
2033 			sin6_t *sin6 = (sin6_t *)ptr;
2034 
2035 			if (!(ipp->ipp_fields & IPPF_NEXTHOP))
2036 				return (0);
2037 			*sin6 = sin6_null;
2038 			sin6->sin6_family = AF_INET6;
2039 			sin6->sin6_addr = ipp->ipp_nexthop;
2040 			ret = (sizeof (sin6_t));
2041 			goto done;
2042 		}
2043 		case IPV6_HOPOPTS:
2044 			if (!(ipp->ipp_fields & IPPF_HOPOPTS))
2045 				return (0);
2046 			if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6)
2047 				return (0);
2048 			bcopy((char *)ipp->ipp_hopopts +
2049 			    icmp->icmp_label_len_v6, ptr,
2050 			    ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2051 			if (icmp->icmp_label_len_v6 > 0) {
2052 				ptr[0] = ((char *)ipp->ipp_hopopts)[0];
2053 				ptr[1] = (ipp->ipp_hopoptslen -
2054 				    icmp->icmp_label_len_v6 + 7) / 8 - 1;
2055 			}
2056 			ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2057 			goto done;
2058 		case IPV6_RTHDRDSTOPTS:
2059 			if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
2060 				return (0);
2061 			bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
2062 			ret = ipp->ipp_rtdstoptslen;
2063 			goto done;
2064 		case IPV6_RTHDR:
2065 			if (!(ipp->ipp_fields & IPPF_RTHDR))
2066 				return (0);
2067 			bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
2068 			ret = ipp->ipp_rthdrlen;
2069 			goto done;
2070 		case IPV6_DSTOPTS:
2071 			if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2072 				ret = 0;
2073 				goto done;
2074 			}
2075 			bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
2076 			ret = ipp->ipp_dstoptslen;
2077 			goto done;
2078 		case IPV6_PATHMTU:
2079 			if (!(ipp->ipp_fields & IPPF_PATHMTU)) {
2080 				ret = 0;
2081 			} else {
2082 				ret = ip_fill_mtuinfo(
2083 				    &icmp->icmp_v6dst.sin6_addr, 0,
2084 				    (struct ip6_mtuinfo *)ptr,
2085 				    is->is_netstack);
2086 			}
2087 			goto done;
2088 		case IPV6_TCLASS:
2089 			if (ipp->ipp_fields & IPPF_TCLASS)
2090 				*i1 = ipp->ipp_tclass;
2091 			else
2092 				*i1 = IPV6_FLOW_TCLASS(
2093 				    IPV6_DEFAULT_VERS_AND_FLOW);
2094 			break;
2095 		default:
2096 			ret = -1;
2097 			goto done;
2098 		}
2099 		break;
2100 	case IPPROTO_ICMPV6:
2101 		/*
2102 		 * Only allow IPv6 option processing on native IPv6 sockets.
2103 		 */
2104 		if (icmp->icmp_family != AF_INET6) {
2105 			ret = -1;
2106 		}
2107 
2108 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
2109 			ret = -1;
2110 		}
2111 
2112 		switch (name) {
2113 		case ICMP6_FILTER:
2114 			if (icmp->icmp_filter == NULL) {
2115 				/* Make it look like "pass all" */
2116 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
2117 			} else {
2118 				(void) bcopy(icmp->icmp_filter, ptr,
2119 				    sizeof (icmp6_filter_t));
2120 			}
2121 			ret = sizeof (icmp6_filter_t);
2122 			goto done;
2123 		default:
2124 			ret = -1;
2125 			goto done;
2126 		}
2127 	default:
2128 		ret = -1;
2129 		goto done;
2130 	}
2131 	ret = sizeof (int);
2132 done:
2133 	return (ret);
2134 }
2135 
2136 /*
2137  * This routine retrieves the current status of socket options.
2138  * It returns the size of the option retrieved.
2139  */
2140 int
2141 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
2142 {
2143 	conn_t  *connp = Q_TO_CONN(q);
2144 	icmp_t	*icmp = connp->conn_icmp;
2145 	int 	err;
2146 
2147 	rw_enter(&icmp->icmp_rwlock, RW_READER);
2148 	err = icmp_opt_get(connp, level, name, ptr);
2149 	rw_exit(&icmp->icmp_rwlock);
2150 	return (err);
2151 }
2152 
2153 int
2154 icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
2155     uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
2156     void *thisdg_attrs, boolean_t checkonly)
2157 {
2158 
2159 	int	*i1 = (int *)invalp;
2160 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
2161 	icmp_t *icmp = connp->conn_icmp;
2162 	icmp_stack_t *is = icmp->icmp_is;
2163 	int	error;
2164 
2165 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
2166 	/*
2167 	 * For fixed length options, no sanity check
2168 	 * of passed in length is done. It is assumed *_optcom_req()
2169 	 * routines do the right thing.
2170 	 */
2171 	switch (level) {
2172 	case SOL_SOCKET:
2173 		switch (name) {
2174 		case SO_DEBUG:
2175 			if (!checkonly)
2176 				icmp->icmp_debug = onoff;
2177 			break;
2178 		case SO_PROTOTYPE:
2179 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2180 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2181 			    secpolicy_net_rawaccess(cr) != 0) {
2182 				*outlenp = 0;
2183 				return (EACCES);
2184 			}
2185 			/* Can't use IPPROTO_RAW with IPv6 */
2186 			if ((*i1 & 0xFF) == IPPROTO_RAW &&
2187 			    icmp->icmp_family == AF_INET6) {
2188 				*outlenp = 0;
2189 				return (EPROTONOSUPPORT);
2190 			}
2191 			if (checkonly) {
2192 				/* T_CHECK case */
2193 				*(int *)outvalp = (*i1 & 0xFF);
2194 				break;
2195 			}
2196 			icmp->icmp_proto = *i1 & 0xFF;
2197 			if ((icmp->icmp_proto == IPPROTO_RAW ||
2198 			    icmp->icmp_proto == IPPROTO_IGMP) &&
2199 			    icmp->icmp_family == AF_INET)
2200 				icmp->icmp_hdrincl = 1;
2201 			else
2202 				icmp->icmp_hdrincl = 0;
2203 
2204 			if (icmp->icmp_family == AF_INET6 &&
2205 			    icmp->icmp_proto == IPPROTO_ICMPV6) {
2206 				/* Set offset for icmp6_cksum */
2207 				icmp->icmp_raw_checksum = 0;
2208 				icmp->icmp_checksum_off = 2;
2209 			}
2210 			if (icmp->icmp_proto == IPPROTO_UDP ||
2211 			    icmp->icmp_proto == IPPROTO_TCP ||
2212 			    icmp->icmp_proto == IPPROTO_SCTP) {
2213 				icmp->icmp_no_tp_cksum = 1;
2214 				icmp->icmp_sticky_ipp.ipp_fields |=
2215 				    IPPF_NO_CKSUM;
2216 			} else {
2217 				icmp->icmp_no_tp_cksum = 0;
2218 				icmp->icmp_sticky_ipp.ipp_fields &=
2219 				    ~IPPF_NO_CKSUM;
2220 			}
2221 
2222 			if (icmp->icmp_filter != NULL &&
2223 			    icmp->icmp_proto != IPPROTO_ICMPV6) {
2224 				kmem_free(icmp->icmp_filter,
2225 				    sizeof (icmp6_filter_t));
2226 				icmp->icmp_filter = NULL;
2227 			}
2228 
2229 			/* Rebuild the header template */
2230 			error = icmp_build_hdrs(icmp);
2231 			if (error != 0) {
2232 				*outlenp = 0;
2233 				return (error);
2234 			}
2235 
2236 			/*
2237 			 * For SCTP, we don't use icmp_bind_proto() for
2238 			 * raw socket binding.  Note that we do not need
2239 			 * to set *outlenp.
2240 			 * FIXME: how does SCTP work?
2241 			 */
2242 			if (icmp->icmp_proto == IPPROTO_SCTP)
2243 				return (0);
2244 
2245 			*outlenp = sizeof (int);
2246 			*(int *)outvalp = *i1 & 0xFF;
2247 
2248 			/* Drop lock across the bind operation */
2249 			rw_exit(&icmp->icmp_rwlock);
2250 			(void) icmp_bind_proto(connp);
2251 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2252 			return (0);
2253 		case SO_REUSEADDR:
2254 			if (!checkonly) {
2255 				icmp->icmp_reuseaddr = onoff;
2256 				PASS_OPT_TO_IP(connp);
2257 			}
2258 			break;
2259 
2260 		/*
2261 		 * The following three items are available here,
2262 		 * but are only meaningful to IP.
2263 		 */
2264 		case SO_DONTROUTE:
2265 			if (!checkonly) {
2266 				icmp->icmp_dontroute = onoff;
2267 				PASS_OPT_TO_IP(connp);
2268 			}
2269 			break;
2270 		case SO_USELOOPBACK:
2271 			if (!checkonly) {
2272 				icmp->icmp_useloopback = onoff;
2273 				PASS_OPT_TO_IP(connp);
2274 			}
2275 			break;
2276 		case SO_BROADCAST:
2277 			if (!checkonly) {
2278 				icmp->icmp_broadcast = onoff;
2279 				PASS_OPT_TO_IP(connp);
2280 			}
2281 			break;
2282 
2283 		case SO_SNDBUF:
2284 			if (*i1 > is->is_max_buf) {
2285 				*outlenp = 0;
2286 				return (ENOBUFS);
2287 			}
2288 			if (!checkonly) {
2289 				if (!IPCL_IS_NONSTR(connp)) {
2290 					connp->conn_wq->q_hiwat = *i1;
2291 				}
2292 				icmp->icmp_xmit_hiwat = *i1;
2293 			}
2294 			break;
2295 		case SO_RCVBUF:
2296 			if (*i1 > is->is_max_buf) {
2297 				*outlenp = 0;
2298 				return (ENOBUFS);
2299 			}
2300 			if (!checkonly) {
2301 				icmp->icmp_recv_hiwat = *i1;
2302 				rw_exit(&icmp->icmp_rwlock);
2303 				(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2304 				    *i1);
2305 				rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2306 			}
2307 			break;
2308 		case SO_DGRAM_ERRIND:
2309 			if (!checkonly)
2310 				icmp->icmp_dgram_errind = onoff;
2311 			break;
2312 		case SO_ALLZONES:
2313 			/*
2314 			 * "soft" error (negative)
2315 			 * option not handled at this level
2316 			 * Note: Do not modify *outlenp
2317 			 */
2318 			return (-EINVAL);
2319 		case SO_TIMESTAMP:
2320 			if (!checkonly) {
2321 				icmp->icmp_timestamp = onoff;
2322 			}
2323 			break;
2324 		case SO_MAC_EXEMPT:
2325 			/*
2326 			 * "soft" error (negative)
2327 			 * option not handled at this level
2328 			 * Note: Do not modify *outlenp
2329 			 */
2330 			return (-EINVAL);
2331 		case SO_RCVTIMEO:
2332 		case SO_SNDTIMEO:
2333 			/*
2334 			 * Pass these two options in order for third part
2335 			 * protocol usage. Here just return directly.
2336 			 */
2337 			return (0);
2338 		/*
2339 		 * Following three not meaningful for icmp
2340 		 * Action is same as "default" so we keep them
2341 		 * in comments.
2342 		 * case SO_LINGER:
2343 		 * case SO_KEEPALIVE:
2344 		 * case SO_OOBINLINE:
2345 		 */
2346 		default:
2347 			*outlenp = 0;
2348 			return (EINVAL);
2349 		}
2350 		break;
2351 	case IPPROTO_IP:
2352 		/*
2353 		 * Only allow IPv4 option processing on IPv4 sockets.
2354 		 */
2355 		if (icmp->icmp_family != AF_INET) {
2356 			*outlenp = 0;
2357 			return (ENOPROTOOPT);
2358 		}
2359 		switch (name) {
2360 		case IP_OPTIONS:
2361 		case T_IP_OPTIONS:
2362 			/* Save options for use by IP. */
2363 			if ((inlen & 0x3) ||
2364 			    inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) {
2365 				*outlenp = 0;
2366 				return (EINVAL);
2367 			}
2368 			if (checkonly)
2369 				break;
2370 
2371 			if (!tsol_option_set(&icmp->icmp_ip_snd_options,
2372 			    &icmp->icmp_ip_snd_options_len,
2373 			    icmp->icmp_label_len, invalp, inlen)) {
2374 				*outlenp = 0;
2375 				return (ENOMEM);
2376 			}
2377 
2378 			icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
2379 			    icmp->icmp_ip_snd_options_len;
2380 			rw_exit(&icmp->icmp_rwlock);
2381 			(void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL:
2382 			    RD(connp->conn_rq), connp,
2383 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
2384 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2385 			break;
2386 		case IP_HDRINCL:
2387 			if (!checkonly)
2388 				icmp->icmp_hdrincl = onoff;
2389 			break;
2390 		case IP_TOS:
2391 		case T_IP_TOS:
2392 			if (!checkonly) {
2393 				icmp->icmp_type_of_service = (uint8_t)*i1;
2394 			}
2395 			break;
2396 		case IP_TTL:
2397 			if (!checkonly) {
2398 				icmp->icmp_ttl = (uint8_t)*i1;
2399 			}
2400 			break;
2401 		case IP_MULTICAST_IF:
2402 			/*
2403 			 * TODO should check OPTMGMT reply and undo this if
2404 			 * there is an error.
2405 			 */
2406 			if (!checkonly) {
2407 				icmp->icmp_multicast_if_addr = *i1;
2408 				PASS_OPT_TO_IP(connp);
2409 			}
2410 			break;
2411 		case IP_MULTICAST_TTL:
2412 			if (!checkonly)
2413 				icmp->icmp_multicast_ttl = *invalp;
2414 			break;
2415 		case IP_MULTICAST_LOOP:
2416 			if (!checkonly) {
2417 				connp->conn_multicast_loop =
2418 				    (*invalp == 0) ? 0 : 1;
2419 				PASS_OPT_TO_IP(connp);
2420 			}
2421 			break;
2422 		case IP_BOUND_IF:
2423 			if (!checkonly) {
2424 				icmp->icmp_bound_if = *i1;
2425 				PASS_OPT_TO_IP(connp);
2426 			}
2427 			break;
2428 		case IP_UNSPEC_SRC:
2429 			if (!checkonly) {
2430 				icmp->icmp_unspec_source = onoff;
2431 				PASS_OPT_TO_IP(connp);
2432 			}
2433 			break;
2434 		case IP_BROADCAST_TTL:
2435 			if (!checkonly)
2436 				connp->conn_broadcast_ttl = *invalp;
2437 			break;
2438 		case IP_RECVIF:
2439 			if (!checkonly) {
2440 				icmp->icmp_recvif = onoff;
2441 			}
2442 			/*
2443 			 * pass to ip
2444 			 */
2445 			return (-EINVAL);
2446 		case IP_PKTINFO: {
2447 			/*
2448 			 * This also handles IP_RECVPKTINFO.
2449 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
2450 			 * Differentiation is based on the size of the argument
2451 			 * passed in.
2452 			 */
2453 			struct in_pktinfo *pktinfop;
2454 			ip4_pkt_t *attr_pktinfop;
2455 
2456 			if (checkonly)
2457 				break;
2458 
2459 			if (inlen == sizeof (int)) {
2460 				/*
2461 				 * This is IP_RECVPKTINFO option.
2462 				 * Keep a local copy of wether this option is
2463 				 * set or not and pass it down to IP for
2464 				 * processing.
2465 				 */
2466 				icmp->icmp_ip_recvpktinfo = onoff;
2467 				return (-EINVAL);
2468 			}
2469 
2470 
2471 			if (inlen != sizeof (struct in_pktinfo)) {
2472 				return (EINVAL);
2473 			}
2474 
2475 			if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs)
2476 			    == NULL) {
2477 				/*
2478 				 * sticky option is not supported
2479 				 */
2480 				return (EINVAL);
2481 			}
2482 
2483 			pktinfop = (struct in_pktinfo *)invalp;
2484 
2485 			/*
2486 			 * Atleast one of the values should be specified
2487 			 */
2488 			if (pktinfop->ipi_ifindex == 0 &&
2489 			    pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) {
2490 				return (EINVAL);
2491 			}
2492 
2493 			attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr;
2494 			attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex;
2495 		}
2496 			break;
2497 		case IP_ADD_MEMBERSHIP:
2498 		case IP_DROP_MEMBERSHIP:
2499 		case IP_BLOCK_SOURCE:
2500 		case IP_UNBLOCK_SOURCE:
2501 		case IP_ADD_SOURCE_MEMBERSHIP:
2502 		case IP_DROP_SOURCE_MEMBERSHIP:
2503 		case MCAST_JOIN_GROUP:
2504 		case MCAST_LEAVE_GROUP:
2505 		case MCAST_BLOCK_SOURCE:
2506 		case MCAST_UNBLOCK_SOURCE:
2507 		case MCAST_JOIN_SOURCE_GROUP:
2508 		case MCAST_LEAVE_SOURCE_GROUP:
2509 		case MRT_INIT:
2510 		case MRT_DONE:
2511 		case MRT_ADD_VIF:
2512 		case MRT_DEL_VIF:
2513 		case MRT_ADD_MFC:
2514 		case MRT_DEL_MFC:
2515 		case MRT_VERSION:
2516 		case MRT_ASSERT:
2517 		case IP_SEC_OPT:
2518 		case IP_NEXTHOP:
2519 			/*
2520 			 * "soft" error (negative)
2521 			 * option not handled at this level
2522 			 * Note: Do not modify *outlenp
2523 			 */
2524 			return (-EINVAL);
2525 		default:
2526 			*outlenp = 0;
2527 			return (EINVAL);
2528 		}
2529 		break;
2530 	case IPPROTO_IPV6: {
2531 		ip6_pkt_t		*ipp;
2532 		boolean_t		sticky;
2533 
2534 		if (icmp->icmp_family != AF_INET6) {
2535 			*outlenp = 0;
2536 			return (ENOPROTOOPT);
2537 		}
2538 		/*
2539 		 * Deal with both sticky options and ancillary data
2540 		 */
2541 		if (thisdg_attrs == NULL) {
2542 			/* sticky options, or none */
2543 			ipp = &icmp->icmp_sticky_ipp;
2544 			sticky = B_TRUE;
2545 		} else {
2546 			/* ancillary data */
2547 			ipp = (ip6_pkt_t *)thisdg_attrs;
2548 			sticky = B_FALSE;
2549 		}
2550 
2551 		switch (name) {
2552 		case IPV6_MULTICAST_IF:
2553 			if (!checkonly) {
2554 				icmp->icmp_multicast_if_index = *i1;
2555 				PASS_OPT_TO_IP(connp);
2556 			}
2557 			break;
2558 		case IPV6_UNICAST_HOPS:
2559 			/* -1 means use default */
2560 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2561 				*outlenp = 0;
2562 				return (EINVAL);
2563 			}
2564 			if (!checkonly) {
2565 				if (*i1 == -1) {
2566 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2567 					    is->is_ipv6_hoplimit;
2568 					ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
2569 					/* Pass modified value to IP. */
2570 					*i1 = ipp->ipp_hoplimit;
2571 				} else {
2572 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2573 					    (uint8_t)*i1;
2574 					ipp->ipp_fields |= IPPF_UNICAST_HOPS;
2575 				}
2576 				/* Rebuild the header template */
2577 				error = icmp_build_hdrs(icmp);
2578 				if (error != 0) {
2579 					*outlenp = 0;
2580 					return (error);
2581 				}
2582 			}
2583 			break;
2584 		case IPV6_MULTICAST_HOPS:
2585 			/* -1 means use default */
2586 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2587 				*outlenp = 0;
2588 				return (EINVAL);
2589 			}
2590 			if (!checkonly) {
2591 				if (*i1 == -1) {
2592 					icmp->icmp_multicast_ttl =
2593 					    ipp->ipp_multicast_hops =
2594 					    IP_DEFAULT_MULTICAST_TTL;
2595 					ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS;
2596 					/* Pass modified value to IP. */
2597 					*i1 = icmp->icmp_multicast_ttl;
2598 				} else {
2599 					icmp->icmp_multicast_ttl =
2600 					    ipp->ipp_multicast_hops =
2601 					    (uint8_t)*i1;
2602 					ipp->ipp_fields |= IPPF_MULTICAST_HOPS;
2603 				}
2604 			}
2605 			break;
2606 		case IPV6_MULTICAST_LOOP:
2607 			if (*i1 != 0 && *i1 != 1) {
2608 				*outlenp = 0;
2609 				return (EINVAL);
2610 			}
2611 			if (!checkonly) {
2612 				connp->conn_multicast_loop = *i1;
2613 				PASS_OPT_TO_IP(connp);
2614 			}
2615 			break;
2616 		case IPV6_CHECKSUM:
2617 			/*
2618 			 * Integer offset into the user data of where the
2619 			 * checksum is located.
2620 			 * Offset of -1 disables option.
2621 			 * Does not apply to IPPROTO_ICMPV6.
2622 			 */
2623 			if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) {
2624 				*outlenp = 0;
2625 				return (EINVAL);
2626 			}
2627 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2628 				/* Negative or not 16 bit aligned offset */
2629 				*outlenp = 0;
2630 				return (EINVAL);
2631 			}
2632 			if (checkonly)
2633 				break;
2634 
2635 			if (*i1 == -1) {
2636 				icmp->icmp_raw_checksum = 0;
2637 				ipp->ipp_fields &= ~IPPF_RAW_CKSUM;
2638 			} else {
2639 				icmp->icmp_raw_checksum = 1;
2640 				icmp->icmp_checksum_off = *i1;
2641 				ipp->ipp_fields |= IPPF_RAW_CKSUM;
2642 			}
2643 			/* Rebuild the header template */
2644 			error = icmp_build_hdrs(icmp);
2645 			if (error != 0) {
2646 				*outlenp = 0;
2647 				return (error);
2648 			}
2649 			break;
2650 		case IPV6_JOIN_GROUP:
2651 		case IPV6_LEAVE_GROUP:
2652 		case MCAST_JOIN_GROUP:
2653 		case MCAST_LEAVE_GROUP:
2654 		case MCAST_BLOCK_SOURCE:
2655 		case MCAST_UNBLOCK_SOURCE:
2656 		case MCAST_JOIN_SOURCE_GROUP:
2657 		case MCAST_LEAVE_SOURCE_GROUP:
2658 			/*
2659 			 * "soft" error (negative)
2660 			 * option not handled at this level
2661 			 * Note: Do not modify *outlenp
2662 			 */
2663 			return (-EINVAL);
2664 		case IPV6_BOUND_IF:
2665 			if (!checkonly) {
2666 				icmp->icmp_bound_if = *i1;
2667 				PASS_OPT_TO_IP(connp);
2668 			}
2669 			break;
2670 		case IPV6_UNSPEC_SRC:
2671 			if (!checkonly) {
2672 				icmp->icmp_unspec_source = onoff;
2673 				PASS_OPT_TO_IP(connp);
2674 			}
2675 			break;
2676 		case IPV6_RECVTCLASS:
2677 			if (!checkonly) {
2678 				icmp->icmp_ipv6_recvtclass = onoff;
2679 				PASS_OPT_TO_IP(connp);
2680 			}
2681 			break;
2682 		/*
2683 		 * Set boolean switches for ancillary data delivery
2684 		 */
2685 		case IPV6_RECVPKTINFO:
2686 			if (!checkonly) {
2687 				icmp->icmp_ip_recvpktinfo = onoff;
2688 				PASS_OPT_TO_IP(connp);
2689 			}
2690 			break;
2691 		case IPV6_RECVPATHMTU:
2692 			if (!checkonly) {
2693 				icmp->icmp_ipv6_recvpathmtu = onoff;
2694 				PASS_OPT_TO_IP(connp);
2695 			}
2696 			break;
2697 		case IPV6_RECVHOPLIMIT:
2698 			if (!checkonly) {
2699 				icmp->icmp_ipv6_recvhoplimit = onoff;
2700 				PASS_OPT_TO_IP(connp);
2701 			}
2702 			break;
2703 		case IPV6_RECVHOPOPTS:
2704 			if (!checkonly) {
2705 				icmp->icmp_ipv6_recvhopopts = onoff;
2706 				PASS_OPT_TO_IP(connp);
2707 			}
2708 			break;
2709 		case IPV6_RECVDSTOPTS:
2710 			if (!checkonly) {
2711 				icmp->icmp_ipv6_recvdstopts = onoff;
2712 				PASS_OPT_TO_IP(connp);
2713 			}
2714 			break;
2715 		case _OLD_IPV6_RECVDSTOPTS:
2716 			if (!checkonly)
2717 				icmp->icmp_old_ipv6_recvdstopts = onoff;
2718 			break;
2719 		case IPV6_RECVRTHDRDSTOPTS:
2720 			if (!checkonly) {
2721 				icmp->icmp_ipv6_recvrtdstopts = onoff;
2722 				PASS_OPT_TO_IP(connp);
2723 			}
2724 			break;
2725 		case IPV6_RECVRTHDR:
2726 			if (!checkonly) {
2727 				icmp->icmp_ipv6_recvrthdr = onoff;
2728 				PASS_OPT_TO_IP(connp);
2729 			}
2730 			break;
2731 		/*
2732 		 * Set sticky options or ancillary data.
2733 		 * If sticky options, (re)build any extension headers
2734 		 * that might be needed as a result.
2735 		 */
2736 		case IPV6_PKTINFO:
2737 			/*
2738 			 * The source address and ifindex are verified
2739 			 * in ip_opt_set(). For ancillary data the
2740 			 * source address is checked in ip_wput_v6.
2741 			 */
2742 			if (inlen != 0 && inlen !=
2743 			    sizeof (struct in6_pktinfo)) {
2744 				return (EINVAL);
2745 			}
2746 			if (checkonly)
2747 				break;
2748 
2749 			if (inlen == 0) {
2750 				ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
2751 				ipp->ipp_sticky_ignored |=
2752 				    (IPPF_IFINDEX|IPPF_ADDR);
2753 			} else {
2754 				struct in6_pktinfo *pkti;
2755 
2756 				pkti = (struct in6_pktinfo *)invalp;
2757 				ipp->ipp_ifindex = pkti->ipi6_ifindex;
2758 				ipp->ipp_addr = pkti->ipi6_addr;
2759 				if (ipp->ipp_ifindex != 0)
2760 					ipp->ipp_fields |= IPPF_IFINDEX;
2761 				else
2762 					ipp->ipp_fields &= ~IPPF_IFINDEX;
2763 				if (!IN6_IS_ADDR_UNSPECIFIED(
2764 				    &ipp->ipp_addr))
2765 					ipp->ipp_fields |= IPPF_ADDR;
2766 				else
2767 					ipp->ipp_fields &= ~IPPF_ADDR;
2768 			}
2769 			if (sticky) {
2770 				error = icmp_build_hdrs(icmp);
2771 				if (error != 0)
2772 					return (error);
2773 				PASS_OPT_TO_IP(connp);
2774 			}
2775 			break;
2776 		case IPV6_HOPLIMIT:
2777 			/* This option can only be used as ancillary data. */
2778 			if (sticky)
2779 				return (EINVAL);
2780 			if (inlen != 0 && inlen != sizeof (int))
2781 				return (EINVAL);
2782 			if (checkonly)
2783 				break;
2784 
2785 			if (inlen == 0) {
2786 				ipp->ipp_fields &= ~IPPF_HOPLIMIT;
2787 				ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT;
2788 			} else {
2789 				if (*i1 > 255 || *i1 < -1)
2790 					return (EINVAL);
2791 				if (*i1 == -1)
2792 					ipp->ipp_hoplimit =
2793 					    is->is_ipv6_hoplimit;
2794 				else
2795 					ipp->ipp_hoplimit = *i1;
2796 				ipp->ipp_fields |= IPPF_HOPLIMIT;
2797 			}
2798 			break;
2799 		case IPV6_TCLASS:
2800 			/*
2801 			 * IPV6_RECVTCLASS accepts -1 as use kernel default
2802 			 * and [0, 255] as the actualy traffic class.
2803 			 */
2804 			if (inlen != 0 && inlen != sizeof (int)) {
2805 				return (EINVAL);
2806 			}
2807 			if (checkonly)
2808 				break;
2809 
2810 			if (inlen == 0) {
2811 				ipp->ipp_fields &= ~IPPF_TCLASS;
2812 				ipp->ipp_sticky_ignored |= IPPF_TCLASS;
2813 			} else {
2814 				if (*i1 >= 256 || *i1 < -1)
2815 					return (EINVAL);
2816 				if (*i1 == -1) {
2817 					ipp->ipp_tclass =
2818 					    IPV6_FLOW_TCLASS(
2819 					    IPV6_DEFAULT_VERS_AND_FLOW);
2820 				} else {
2821 					ipp->ipp_tclass = *i1;
2822 				}
2823 				ipp->ipp_fields |= IPPF_TCLASS;
2824 			}
2825 			if (sticky) {
2826 				error = icmp_build_hdrs(icmp);
2827 				if (error != 0)
2828 					return (error);
2829 			}
2830 			break;
2831 		case IPV6_NEXTHOP:
2832 			/*
2833 			 * IP will verify that the nexthop is reachable
2834 			 * and fail for sticky options.
2835 			 */
2836 			if (inlen != 0 && inlen != sizeof (sin6_t)) {
2837 				return (EINVAL);
2838 			}
2839 			if (checkonly)
2840 				break;
2841 
2842 			if (inlen == 0) {
2843 				ipp->ipp_fields &= ~IPPF_NEXTHOP;
2844 				ipp->ipp_sticky_ignored |= IPPF_NEXTHOP;
2845 			} else {
2846 				sin6_t *sin6 = (sin6_t *)invalp;
2847 
2848 				if (sin6->sin6_family != AF_INET6) {
2849 					return (EAFNOSUPPORT);
2850 				}
2851 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
2852 					return (EADDRNOTAVAIL);
2853 				}
2854 				ipp->ipp_nexthop = sin6->sin6_addr;
2855 				if (!IN6_IS_ADDR_UNSPECIFIED(
2856 				    &ipp->ipp_nexthop))
2857 					ipp->ipp_fields |= IPPF_NEXTHOP;
2858 				else
2859 					ipp->ipp_fields &= ~IPPF_NEXTHOP;
2860 			}
2861 			if (sticky) {
2862 				error = icmp_build_hdrs(icmp);
2863 				if (error != 0)
2864 					return (error);
2865 				PASS_OPT_TO_IP(connp);
2866 			}
2867 			break;
2868 		case IPV6_HOPOPTS: {
2869 			ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
2870 			/*
2871 			 * Sanity checks - minimum size, size a multiple of
2872 			 * eight bytes, and matching size passed in.
2873 			 */
2874 			if (inlen != 0 &&
2875 			    inlen != (8 * (hopts->ip6h_len + 1))) {
2876 				return (EINVAL);
2877 			}
2878 
2879 			if (checkonly)
2880 				break;
2881 			error = optcom_pkt_set(invalp, inlen, sticky,
2882 			    (uchar_t **)&ipp->ipp_hopopts,
2883 			    &ipp->ipp_hopoptslen,
2884 			    sticky ? icmp->icmp_label_len_v6 : 0);
2885 			if (error != 0)
2886 				return (error);
2887 			if (ipp->ipp_hopoptslen == 0) {
2888 				ipp->ipp_fields &= ~IPPF_HOPOPTS;
2889 				ipp->ipp_sticky_ignored |= IPPF_HOPOPTS;
2890 			} else {
2891 				ipp->ipp_fields |= IPPF_HOPOPTS;
2892 			}
2893 			if (sticky) {
2894 				error = icmp_build_hdrs(icmp);
2895 				if (error != 0)
2896 					return (error);
2897 			}
2898 			break;
2899 		}
2900 		case IPV6_RTHDRDSTOPTS: {
2901 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2902 
2903 			/*
2904 			 * Sanity checks - minimum size, size a multiple of
2905 			 * eight bytes, and matching size passed in.
2906 			 */
2907 			if (inlen != 0 &&
2908 			    inlen != (8 * (dopts->ip6d_len + 1)))
2909 				return (EINVAL);
2910 
2911 			if (checkonly)
2912 				break;
2913 
2914 			if (inlen == 0) {
2915 				if (sticky &&
2916 				    (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) {
2917 					kmem_free(ipp->ipp_rtdstopts,
2918 					    ipp->ipp_rtdstoptslen);
2919 					ipp->ipp_rtdstopts = NULL;
2920 					ipp->ipp_rtdstoptslen = 0;
2921 				}
2922 				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
2923 				ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
2924 			} else {
2925 				error = optcom_pkt_set(invalp, inlen, sticky,
2926 				    (uchar_t **)&ipp->ipp_rtdstopts,
2927 				    &ipp->ipp_rtdstoptslen, 0);
2928 				if (error != 0)
2929 					return (error);
2930 				ipp->ipp_fields |= IPPF_RTDSTOPTS;
2931 			}
2932 			if (sticky) {
2933 				error = icmp_build_hdrs(icmp);
2934 				if (error != 0)
2935 					return (error);
2936 			}
2937 			break;
2938 		}
2939 		case IPV6_DSTOPTS: {
2940 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2941 
2942 			/*
2943 			 * Sanity checks - minimum size, size a multiple of
2944 			 * eight bytes, and matching size passed in.
2945 			 */
2946 			if (inlen != 0 &&
2947 			    inlen != (8 * (dopts->ip6d_len + 1)))
2948 				return (EINVAL);
2949 
2950 			if (checkonly)
2951 				break;
2952 
2953 			if (inlen == 0) {
2954 				if (sticky &&
2955 				    (ipp->ipp_fields & IPPF_DSTOPTS) != 0) {
2956 					kmem_free(ipp->ipp_dstopts,
2957 					    ipp->ipp_dstoptslen);
2958 					ipp->ipp_dstopts = NULL;
2959 					ipp->ipp_dstoptslen = 0;
2960 				}
2961 				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2962 				ipp->ipp_sticky_ignored |= IPPF_DSTOPTS;
2963 			} else {
2964 				error = optcom_pkt_set(invalp, inlen, sticky,
2965 				    (uchar_t **)&ipp->ipp_dstopts,
2966 				    &ipp->ipp_dstoptslen, 0);
2967 				if (error != 0)
2968 					return (error);
2969 				ipp->ipp_fields |= IPPF_DSTOPTS;
2970 			}
2971 			if (sticky) {
2972 				error = icmp_build_hdrs(icmp);
2973 				if (error != 0)
2974 					return (error);
2975 			}
2976 			break;
2977 		}
2978 		case IPV6_RTHDR: {
2979 			ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
2980 
2981 			/*
2982 			 * Sanity checks - minimum size, size a multiple of
2983 			 * eight bytes, and matching size passed in.
2984 			 */
2985 			if (inlen != 0 &&
2986 			    inlen != (8 * (rt->ip6r_len + 1)))
2987 				return (EINVAL);
2988 
2989 			if (checkonly)
2990 				break;
2991 
2992 			if (inlen == 0) {
2993 				if (sticky &&
2994 				    (ipp->ipp_fields & IPPF_RTHDR) != 0) {
2995 					kmem_free(ipp->ipp_rthdr,
2996 					    ipp->ipp_rthdrlen);
2997 					ipp->ipp_rthdr = NULL;
2998 					ipp->ipp_rthdrlen = 0;
2999 				}
3000 				ipp->ipp_fields &= ~IPPF_RTHDR;
3001 				ipp->ipp_sticky_ignored |= IPPF_RTHDR;
3002 			} else {
3003 				error = optcom_pkt_set(invalp, inlen, sticky,
3004 				    (uchar_t **)&ipp->ipp_rthdr,
3005 				    &ipp->ipp_rthdrlen, 0);
3006 				if (error != 0)
3007 					return (error);
3008 				ipp->ipp_fields |= IPPF_RTHDR;
3009 			}
3010 			if (sticky) {
3011 				error = icmp_build_hdrs(icmp);
3012 				if (error != 0)
3013 					return (error);
3014 			}
3015 			break;
3016 		}
3017 
3018 		case IPV6_DONTFRAG:
3019 			if (checkonly)
3020 				break;
3021 
3022 			if (onoff) {
3023 				ipp->ipp_fields |= IPPF_DONTFRAG;
3024 			} else {
3025 				ipp->ipp_fields &= ~IPPF_DONTFRAG;
3026 			}
3027 			break;
3028 
3029 		case IPV6_USE_MIN_MTU:
3030 			if (inlen != sizeof (int))
3031 				return (EINVAL);
3032 
3033 			if (*i1 < -1 || *i1 > 1)
3034 				return (EINVAL);
3035 
3036 			if (checkonly)
3037 				break;
3038 
3039 			ipp->ipp_fields |= IPPF_USE_MIN_MTU;
3040 			ipp->ipp_use_min_mtu = *i1;
3041 			break;
3042 
3043 		/*
3044 		 * This option can't be set.  Its only returned via
3045 		 * getsockopt() or ancillary data.
3046 		 */
3047 		case IPV6_PATHMTU:
3048 			return (EINVAL);
3049 
3050 		case IPV6_SEC_OPT:
3051 		case IPV6_SRC_PREFERENCES:
3052 		case IPV6_V6ONLY:
3053 			/* Handled at IP level */
3054 			return (-EINVAL);
3055 		default:
3056 			*outlenp = 0;
3057 			return (EINVAL);
3058 		}
3059 		break;
3060 	}		/* end IPPROTO_IPV6 */
3061 
3062 	case IPPROTO_ICMPV6:
3063 		/*
3064 		 * Only allow IPv6 option processing on IPv6 sockets.
3065 		 */
3066 		if (icmp->icmp_family != AF_INET6) {
3067 			*outlenp = 0;
3068 			return (ENOPROTOOPT);
3069 		}
3070 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
3071 			*outlenp = 0;
3072 			return (ENOPROTOOPT);
3073 		}
3074 		switch (name) {
3075 		case ICMP6_FILTER:
3076 			if (!checkonly) {
3077 				if ((inlen != 0) &&
3078 				    (inlen != sizeof (icmp6_filter_t)))
3079 					return (EINVAL);
3080 
3081 				if (inlen == 0) {
3082 					if (icmp->icmp_filter != NULL) {
3083 						kmem_free(icmp->icmp_filter,
3084 						    sizeof (icmp6_filter_t));
3085 						icmp->icmp_filter = NULL;
3086 					}
3087 				} else {
3088 					if (icmp->icmp_filter == NULL) {
3089 						icmp->icmp_filter = kmem_alloc(
3090 						    sizeof (icmp6_filter_t),
3091 						    KM_NOSLEEP);
3092 						if (icmp->icmp_filter == NULL) {
3093 							*outlenp = 0;
3094 							return (ENOBUFS);
3095 						}
3096 					}
3097 					(void) bcopy(invalp, icmp->icmp_filter,
3098 					    inlen);
3099 				}
3100 			}
3101 			break;
3102 
3103 		default:
3104 			*outlenp = 0;
3105 			return (EINVAL);
3106 		}
3107 		break;
3108 	default:
3109 		*outlenp = 0;
3110 		return (EINVAL);
3111 	}
3112 	/*
3113 	 * Common case of OK return with outval same as inval.
3114 	 */
3115 	if (invalp != outvalp) {
3116 		/* don't trust bcopy for identical src/dst */
3117 		(void) bcopy(invalp, outvalp, inlen);
3118 	}
3119 	*outlenp = inlen;
3120 	return (0);
3121 }
3122 
3123 /* This routine sets socket options. */
3124 /* ARGSUSED */
3125 int
3126 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
3127     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3128     void *thisdg_attrs, cred_t *cr)
3129 {
3130 	boolean_t checkonly;
3131 	int	error;
3132 
3133 	error = 0;
3134 	switch (optset_context) {
3135 	case SETFN_OPTCOM_CHECKONLY:
3136 		checkonly = B_TRUE;
3137 		/*
3138 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
3139 		 * inlen != 0 implies value supplied and
3140 		 * 	we have to "pretend" to set it.
3141 		 * inlen == 0 implies that there is no
3142 		 * 	value part in T_CHECK request and just validation
3143 		 * done elsewhere should be enough, we just return here.
3144 		 */
3145 		if (inlen == 0) {
3146 			*outlenp = 0;
3147 			error = 0;
3148 			goto done;
3149 		}
3150 		break;
3151 	case SETFN_OPTCOM_NEGOTIATE:
3152 		checkonly = B_FALSE;
3153 		break;
3154 	case SETFN_UD_NEGOTIATE:
3155 	case SETFN_CONN_NEGOTIATE:
3156 		checkonly = B_FALSE;
3157 		/*
3158 		 * Negotiating local and "association-related" options
3159 		 * through T_UNITDATA_REQ.
3160 		 *
3161 		 * Following routine can filter out ones we do not
3162 		 * want to be "set" this way.
3163 		 */
3164 		if (!icmp_opt_allow_udr_set(level, name)) {
3165 			*outlenp = 0;
3166 			error = EINVAL;
3167 			goto done;
3168 		}
3169 		break;
3170 	default:
3171 		/*
3172 		 * We should never get here
3173 		 */
3174 		*outlenp = 0;
3175 		error = EINVAL;
3176 		goto done;
3177 	}
3178 
3179 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
3180 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
3181 	error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
3182 	    outvalp, cr, thisdg_attrs, checkonly);
3183 
3184 done:
3185 	return (error);
3186 }
3187 
3188 /* This routine sets socket options. */
3189 /* ARGSUSED */
3190 int
3191 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
3192     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3193     void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
3194 {
3195 	conn_t	*connp =  Q_TO_CONN(q);
3196 	icmp_t	*icmp;
3197 	int error;
3198 
3199 	icmp = connp->conn_icmp;
3200 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3201 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
3202 	    outlenp, outvalp, thisdg_attrs, cr);
3203 	rw_exit(&icmp->icmp_rwlock);
3204 	return (error);
3205 }
3206 
3207 /*
3208  * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl,
3209  * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum.
3210  * The headers include ip6i_t (if needed), ip6_t, and any sticky extension
3211  * headers.
3212  * Returns failure if can't allocate memory.
3213  */
3214 static int
3215 icmp_build_hdrs(icmp_t *icmp)
3216 {
3217 	icmp_stack_t *is = icmp->icmp_is;
3218 	uchar_t	*hdrs;
3219 	uint_t	hdrs_len;
3220 	ip6_t	*ip6h;
3221 	ip6i_t	*ip6i;
3222 	ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
3223 
3224 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
3225 	hdrs_len = ip_total_hdrs_len_v6(ipp);
3226 	ASSERT(hdrs_len != 0);
3227 	if (hdrs_len != icmp->icmp_sticky_hdrs_len) {
3228 		/* Need to reallocate */
3229 		if (hdrs_len != 0) {
3230 			hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
3231 			if (hdrs == NULL)
3232 				return (ENOMEM);
3233 		} else {
3234 			hdrs = NULL;
3235 		}
3236 		if (icmp->icmp_sticky_hdrs_len != 0) {
3237 			kmem_free(icmp->icmp_sticky_hdrs,
3238 			    icmp->icmp_sticky_hdrs_len);
3239 		}
3240 		icmp->icmp_sticky_hdrs = hdrs;
3241 		icmp->icmp_sticky_hdrs_len = hdrs_len;
3242 	}
3243 	ip_build_hdrs_v6(icmp->icmp_sticky_hdrs,
3244 	    icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto);
3245 
3246 	/* Set header fields not in ipp */
3247 	if (ipp->ipp_fields & IPPF_HAS_IP6I) {
3248 		ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs;
3249 		ip6h = (ip6_t *)&ip6i[1];
3250 
3251 		if (ipp->ipp_fields & IPPF_RAW_CKSUM) {
3252 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
3253 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
3254 		}
3255 		if (ipp->ipp_fields & IPPF_NO_CKSUM) {
3256 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
3257 		}
3258 	} else {
3259 		ip6h = (ip6_t *)icmp->icmp_sticky_hdrs;
3260 	}
3261 
3262 	if (!(ipp->ipp_fields & IPPF_ADDR))
3263 		ip6h->ip6_src = icmp->icmp_v6src;
3264 
3265 	/* Try to get everything in a single mblk */
3266 	if (hdrs_len > icmp->icmp_max_hdr_len) {
3267 		icmp->icmp_max_hdr_len = hdrs_len;
3268 		rw_exit(&icmp->icmp_rwlock);
3269 		(void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq,
3270 		    icmp->icmp_connp,
3271 		    icmp->icmp_max_hdr_len + is->is_wroff_extra);
3272 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3273 	}
3274 	return (0);
3275 }
3276 
3277 /*
3278  * This routine retrieves the value of an ND variable in a icmpparam_t
3279  * structure.  It is called through nd_getset when a user reads the
3280  * variable.
3281  */
3282 /* ARGSUSED */
3283 static int
3284 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
3285 {
3286 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3287 
3288 	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
3289 	return (0);
3290 }
3291 
3292 /*
3293  * Walk through the param array specified registering each element with the
3294  * named dispatch (ND) handler.
3295  */
3296 static boolean_t
3297 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
3298 {
3299 	for (; cnt-- > 0; icmppa++) {
3300 		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
3301 			if (!nd_load(ndp, icmppa->icmp_param_name,
3302 			    icmp_param_get, icmp_param_set,
3303 			    (caddr_t)icmppa)) {
3304 				nd_free(ndp);
3305 				return (B_FALSE);
3306 			}
3307 		}
3308 	}
3309 	if (!nd_load(ndp, "icmp_status", icmp_status_report, NULL,
3310 	    NULL)) {
3311 		nd_free(ndp);
3312 		return (B_FALSE);
3313 	}
3314 	return (B_TRUE);
3315 }
3316 
3317 /* This routine sets an ND variable in a icmpparam_t structure. */
3318 /* ARGSUSED */
3319 static int
3320 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
3321 {
3322 	long		new_value;
3323 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3324 
3325 	/*
3326 	 * Fail the request if the new value does not lie within the
3327 	 * required bounds.
3328 	 */
3329 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
3330 	    new_value < icmppa->icmp_param_min ||
3331 	    new_value > icmppa->icmp_param_max) {
3332 		return (EINVAL);
3333 	}
3334 	/* Set the new value */
3335 	icmppa->icmp_param_value = new_value;
3336 	return (0);
3337 }
3338 
3339 static mblk_t *
3340 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
3341 {
3342 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
3343 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
3344 		/*
3345 		 * fallback has started but messages have not been moved yet
3346 		 */
3347 		if (icmp->icmp_fallback_queue_head == NULL) {
3348 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
3349 			icmp->icmp_fallback_queue_head = mp;
3350 			icmp->icmp_fallback_queue_tail = mp;
3351 		} else {
3352 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
3353 			icmp->icmp_fallback_queue_tail->b_next = mp;
3354 			icmp->icmp_fallback_queue_tail = mp;
3355 		}
3356 		return (NULL);
3357 	} else {
3358 		/*
3359 		 * Fallback completed, let the caller putnext() the mblk.
3360 		 */
3361 		return (mp);
3362 	}
3363 }
3364 
3365 /*
3366  * Deliver data to ULP. In case we have a socket, and it's falling back to
3367  * TPI, then we'll queue the mp for later processing.
3368  */
3369 static void
3370 icmp_ulp_recv(conn_t *connp, mblk_t *mp)
3371 {
3372 
3373 	if (IPCL_IS_NONSTR(connp)) {
3374 		icmp_t *icmp = connp->conn_icmp;
3375 		int error;
3376 
3377 		if ((*connp->conn_upcalls->su_recv)
3378 		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
3379 		    NULL) < 0) {
3380 			mutex_enter(&icmp->icmp_recv_lock);
3381 			if (error == ENOSPC) {
3382 				/*
3383 				 * let's confirm while holding the lock
3384 				 */
3385 				if ((*connp->conn_upcalls->su_recv)
3386 				    (connp->conn_upper_handle, NULL, 0, 0,
3387 				    &error, NULL) < 0) {
3388 					ASSERT(error == ENOSPC);
3389 					if (error == ENOSPC) {
3390 						connp->conn_flow_cntrld =
3391 						    B_TRUE;
3392 					}
3393 				}
3394 				mutex_exit(&icmp->icmp_recv_lock);
3395 			} else {
3396 				ASSERT(error == EOPNOTSUPP);
3397 				mp = icmp_queue_fallback(icmp, mp);
3398 				mutex_exit(&icmp->icmp_recv_lock);
3399 				if (mp != NULL)
3400 					putnext(connp->conn_rq, mp);
3401 			}
3402 		}
3403 		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
3404 	} else {
3405 		putnext(connp->conn_rq, mp);
3406 	}
3407 }
3408 
3409 /*ARGSUSED2*/
3410 static void
3411 icmp_input(void *arg1, mblk_t *mp, void *arg2)
3412 {
3413 	conn_t *connp = (conn_t *)arg1;
3414 	struct T_unitdata_ind	*tudi;
3415 	uchar_t			*rptr;
3416 	icmp_t			*icmp;
3417 	icmp_stack_t		*is;
3418 	sin_t			*sin;
3419 	sin6_t			*sin6;
3420 	ip6_t			*ip6h;
3421 	ip6i_t			*ip6i;
3422 	mblk_t			*mp1;
3423 	int			hdr_len;
3424 	ipha_t			*ipha;
3425 	int			udi_size;	/* Size of T_unitdata_ind */
3426 	uint_t			ipvers;
3427 	ip6_pkt_t		ipp;
3428 	uint8_t			nexthdr;
3429 	ip_pktinfo_t		*pinfo = NULL;
3430 	mblk_t			*options_mp = NULL;
3431 	uint_t			icmp_opt = 0;
3432 	boolean_t		icmp_ipv6_recvhoplimit = B_FALSE;
3433 	uint_t			hopstrip;
3434 
3435 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
3436 
3437 	icmp = connp->conn_icmp;
3438 	is = icmp->icmp_is;
3439 	rptr = mp->b_rptr;
3440 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
3441 	ASSERT(OK_32PTR(rptr));
3442 
3443 	/*
3444 	 * IP should have prepended the options data in an M_CTL
3445 	 * Check M_CTL "type" to make sure are not here bcos of
3446 	 * a valid ICMP message
3447 	 */
3448 	if (DB_TYPE(mp) == M_CTL) {
3449 		/*
3450 		 * FIXME: does IP still do this?
3451 		 * IP sends up the IPSEC_IN message for handling IPSEC
3452 		 * policy at the TCP level. We don't need it here.
3453 		 */
3454 		if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) {
3455 			mp1 = mp->b_cont;
3456 			freeb(mp);
3457 			mp = mp1;
3458 			rptr = mp->b_rptr;
3459 		} else if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
3460 		    ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
3461 		    IN_PKTINFO) {
3462 			/*
3463 			 * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information
3464 			 * has been prepended to the packet by IP. We need to
3465 			 * extract the mblk and adjust the rptr
3466 			 */
3467 			pinfo = (ip_pktinfo_t *)mp->b_rptr;
3468 			options_mp = mp;
3469 			mp = mp->b_cont;
3470 			rptr = mp->b_rptr;
3471 		} else {
3472 			/*
3473 			 * ICMP messages.
3474 			 */
3475 			icmp_icmp_error(connp, mp);
3476 			return;
3477 		}
3478 	}
3479 
3480 	/*
3481 	 * Discard message if it is misaligned or smaller than the IP header.
3482 	 */
3483 	if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) {
3484 		freemsg(mp);
3485 		if (options_mp != NULL)
3486 			freeb(options_mp);
3487 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3488 		return;
3489 	}
3490 	ipvers = IPH_HDR_VERSION((ipha_t *)rptr);
3491 
3492 	/* Handle M_DATA messages containing IP packets messages */
3493 	if (ipvers == IPV4_VERSION) {
3494 		/*
3495 		 * Special case where IP attaches
3496 		 * the IRE needs to be handled so that we don't send up
3497 		 * IRE to the user land.
3498 		 */
3499 		ipha = (ipha_t *)rptr;
3500 		hdr_len = IPH_HDR_LENGTH(ipha);
3501 
3502 		if (ipha->ipha_protocol == IPPROTO_TCP) {
3503 			tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3504 
3505 			if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) ==
3506 			    TH_SYN) && mp->b_cont != NULL) {
3507 				mp1 = mp->b_cont;
3508 				if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3509 					freeb(mp1);
3510 					mp->b_cont = NULL;
3511 				}
3512 			}
3513 		}
3514 		if (is->is_bsd_compat) {
3515 			ushort_t len;
3516 			len = ntohs(ipha->ipha_length);
3517 
3518 			if (mp->b_datap->db_ref > 1) {
3519 				/*
3520 				 * Allocate a new IP header so that we can
3521 				 * modify ipha_length.
3522 				 */
3523 				mblk_t	*mp1;
3524 
3525 				mp1 = allocb(hdr_len, BPRI_MED);
3526 				if (!mp1) {
3527 					freemsg(mp);
3528 					if (options_mp != NULL)
3529 						freeb(options_mp);
3530 					BUMP_MIB(&is->is_rawip_mib,
3531 					    rawipInErrors);
3532 					return;
3533 				}
3534 				bcopy(rptr, mp1->b_rptr, hdr_len);
3535 				mp->b_rptr = rptr + hdr_len;
3536 				rptr = mp1->b_rptr;
3537 				ipha = (ipha_t *)rptr;
3538 				mp1->b_cont = mp;
3539 				mp1->b_wptr = rptr + hdr_len;
3540 				mp = mp1;
3541 			}
3542 			len -= hdr_len;
3543 			ipha->ipha_length = htons(len);
3544 		}
3545 	}
3546 
3547 	/*
3548 	 * This is the inbound data path.  Packets are passed upstream as
3549 	 * T_UNITDATA_IND messages with full IP headers still attached.
3550 	 */
3551 	if (icmp->icmp_family == AF_INET) {
3552 		ASSERT(ipvers == IPV4_VERSION);
3553 		udi_size =  sizeof (struct T_unitdata_ind) + sizeof (sin_t);
3554 		if (icmp->icmp_recvif && (pinfo != NULL) &&
3555 		    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3556 			udi_size += sizeof (struct T_opthdr) +
3557 			    sizeof (uint_t);
3558 		}
3559 
3560 		if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3561 		    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3562 			udi_size += sizeof (struct T_opthdr) +
3563 			    sizeof (struct in_pktinfo);
3564 		}
3565 
3566 		/*
3567 		 * If SO_TIMESTAMP is set allocate the appropriate sized
3568 		 * buffer. Since gethrestime() expects a pointer aligned
3569 		 * argument, we allocate space necessary for extra
3570 		 * alignment (even though it might not be used).
3571 		 */
3572 		if (icmp->icmp_timestamp) {
3573 			udi_size += sizeof (struct T_opthdr) +
3574 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3575 		}
3576 		mp1 = allocb(udi_size, BPRI_MED);
3577 		if (mp1 == NULL) {
3578 			freemsg(mp);
3579 			if (options_mp != NULL)
3580 				freeb(options_mp);
3581 			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3582 			return;
3583 		}
3584 		mp1->b_cont = mp;
3585 		mp = mp1;
3586 		tudi = (struct T_unitdata_ind *)mp->b_rptr;
3587 		mp->b_datap->db_type = M_PROTO;
3588 		mp->b_wptr = (uchar_t *)tudi + udi_size;
3589 		tudi->PRIM_type = T_UNITDATA_IND;
3590 		tudi->SRC_length = sizeof (sin_t);
3591 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3592 		sin = (sin_t *)&tudi[1];
3593 		*sin = sin_null;
3594 		sin->sin_family = AF_INET;
3595 		sin->sin_addr.s_addr = ipha->ipha_src;
3596 		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
3597 		    sizeof (sin_t);
3598 		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
3599 		tudi->OPT_length = udi_size;
3600 
3601 		/*
3602 		 * Add options if IP_RECVIF is set
3603 		 */
3604 		if (udi_size != 0) {
3605 			char *dstopt;
3606 
3607 			dstopt = (char *)&sin[1];
3608 			if (icmp->icmp_recvif && (pinfo != NULL) &&
3609 			    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3610 
3611 				struct T_opthdr *toh;
3612 				uint_t		*dstptr;
3613 
3614 				toh = (struct T_opthdr *)dstopt;
3615 				toh->level = IPPROTO_IP;
3616 				toh->name = IP_RECVIF;
3617 				toh->len = sizeof (struct T_opthdr) +
3618 				    sizeof (uint_t);
3619 				toh->status = 0;
3620 				dstopt += sizeof (struct T_opthdr);
3621 				dstptr = (uint_t *)dstopt;
3622 				*dstptr = pinfo->ip_pkt_ifindex;
3623 				dstopt += sizeof (uint_t);
3624 				udi_size -= toh->len;
3625 			}
3626 			if (icmp->icmp_timestamp) {
3627 				struct	T_opthdr *toh;
3628 
3629 				toh = (struct T_opthdr *)dstopt;
3630 				toh->level = SOL_SOCKET;
3631 				toh->name = SCM_TIMESTAMP;
3632 				toh->len = sizeof (struct T_opthdr) +
3633 				    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3634 				toh->status = 0;
3635 				dstopt += sizeof (struct T_opthdr);
3636 				/* Align for gethrestime() */
3637 				dstopt = (char *)P2ROUNDUP((intptr_t)dstopt,
3638 				    sizeof (intptr_t));
3639 				gethrestime((timestruc_t *)dstopt);
3640 				dstopt = (char *)toh + toh->len;
3641 				udi_size -= toh->len;
3642 			}
3643 			if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3644 			    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3645 				struct	T_opthdr *toh;
3646 				struct	in_pktinfo *pktinfop;
3647 
3648 				toh = (struct T_opthdr *)dstopt;
3649 				toh->level = IPPROTO_IP;
3650 				toh->name = IP_PKTINFO;
3651 				toh->len = sizeof (struct T_opthdr) +
3652 				    sizeof (in_pktinfo_t);
3653 				toh->status = 0;
3654 				dstopt += sizeof (struct T_opthdr);
3655 				pktinfop = (struct in_pktinfo *)dstopt;
3656 				pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex;
3657 				pktinfop->ipi_spec_dst =
3658 				    pinfo->ip_pkt_match_addr;
3659 
3660 				pktinfop->ipi_addr.s_addr = ipha->ipha_dst;
3661 
3662 				dstopt += sizeof (struct in_pktinfo);
3663 				udi_size -= toh->len;
3664 			}
3665 
3666 			/* Consumed all of allocated space */
3667 			ASSERT(udi_size == 0);
3668 		}
3669 
3670 		if (options_mp != NULL)
3671 			freeb(options_mp);
3672 
3673 		BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
3674 		goto deliver;
3675 	}
3676 
3677 	/*
3678 	 * We don't need options_mp in the IPv6 path.
3679 	 */
3680 	if (options_mp != NULL) {
3681 		freeb(options_mp);
3682 		options_mp = NULL;
3683 	}
3684 
3685 	/*
3686 	 * Discard message if it is smaller than the IPv6 header
3687 	 * or if the header is malformed.
3688 	 */
3689 	if ((mp->b_wptr - rptr) < sizeof (ip6_t) ||
3690 	    IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION ||
3691 	    icmp->icmp_family != AF_INET6) {
3692 		freemsg(mp);
3693 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3694 		return;
3695 	}
3696 
3697 	/* Initialize */
3698 	ipp.ipp_fields = 0;
3699 	hopstrip = 0;
3700 
3701 	ip6h = (ip6_t *)rptr;
3702 	/*
3703 	 * Call on ip_find_hdr_v6 which gets the total hdr len
3704 	 * as well as individual lenghts of ext hdrs (and ptrs to
3705 	 * them).
3706 	 */
3707 	if (ip6h->ip6_nxt != icmp->icmp_proto) {
3708 		/* Look for ifindex information */
3709 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3710 			ip6i = (ip6i_t *)ip6h;
3711 			if (ip6i->ip6i_flags & IP6I_IFINDEX) {
3712 				ASSERT(ip6i->ip6i_ifindex != 0);
3713 				ipp.ipp_fields |= IPPF_IFINDEX;
3714 				ipp.ipp_ifindex = ip6i->ip6i_ifindex;
3715 			}
3716 			rptr = (uchar_t *)&ip6i[1];
3717 			mp->b_rptr = rptr;
3718 			if (rptr == mp->b_wptr) {
3719 				mp1 = mp->b_cont;
3720 				freeb(mp);
3721 				mp = mp1;
3722 				rptr = mp->b_rptr;
3723 			}
3724 			ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN);
3725 			ip6h = (ip6_t *)rptr;
3726 		}
3727 		hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr);
3728 
3729 		/*
3730 		 * We need to lie a bit to the user because users inside
3731 		 * labeled compartments should not see their own labels.  We
3732 		 * assume that in all other respects IP has checked the label,
3733 		 * and that the label is always first among the options.  (If
3734 		 * it's not first, then this code won't see it, and the option
3735 		 * will be passed along to the user.)
3736 		 *
3737 		 * If we had multilevel ICMP sockets, then the following code
3738 		 * should be skipped for them to allow the user to see the
3739 		 * label.
3740 		 *
3741 		 * Alignment restrictions in the definition of IP options
3742 		 * (namely, the requirement that the 4-octet DOI goes on a
3743 		 * 4-octet boundary) mean that we know exactly where the option
3744 		 * should start, but we're lenient for other hosts.
3745 		 *
3746 		 * Note that there are no multilevel ICMP or raw IP sockets
3747 		 * yet, thus nobody ever sees the IP6OPT_LS option.
3748 		 */
3749 		if ((ipp.ipp_fields & IPPF_HOPOPTS) &&
3750 		    ipp.ipp_hopoptslen > 5 && is_system_labeled()) {
3751 			const uchar_t *ucp =
3752 			    (const uchar_t *)ipp.ipp_hopopts + 2;
3753 			int remlen = ipp.ipp_hopoptslen - 2;
3754 
3755 			while (remlen > 0) {
3756 				if (*ucp == IP6OPT_PAD1) {
3757 					remlen--;
3758 					ucp++;
3759 				} else if (*ucp == IP6OPT_PADN) {
3760 					remlen -= ucp[1] + 2;
3761 					ucp += ucp[1] + 2;
3762 				} else if (*ucp == ip6opt_ls) {
3763 					hopstrip = (ucp -
3764 					    (const uchar_t *)ipp.ipp_hopopts) +
3765 					    ucp[1] + 2;
3766 					hopstrip = (hopstrip + 7) & ~7;
3767 					break;
3768 				} else {
3769 					/* label option must be first */
3770 					break;
3771 				}
3772 			}
3773 		}
3774 	} else {
3775 		hdr_len = IPV6_HDR_LEN;
3776 		ip6i = NULL;
3777 		nexthdr = ip6h->ip6_nxt;
3778 	}
3779 	/*
3780 	 * One special case where IP attaches the IRE needs to
3781 	 * be handled so that we don't send up IRE to the user land.
3782 	 */
3783 	if (nexthdr == IPPROTO_TCP) {
3784 		tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3785 
3786 		if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) &&
3787 		    mp->b_cont != NULL) {
3788 			mp1 = mp->b_cont;
3789 			if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3790 				freeb(mp1);
3791 				mp->b_cont = NULL;
3792 			}
3793 		}
3794 	}
3795 	/*
3796 	 * Check a filter for ICMPv6 types if needed.
3797 	 * Verify raw checksums if needed.
3798 	 */
3799 	if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) {
3800 		if (icmp->icmp_filter != NULL) {
3801 			int type;
3802 
3803 			/* Assumes that IP has done the pullupmsg */
3804 			type = mp->b_rptr[hdr_len];
3805 
3806 			ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr);
3807 			if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
3808 				freemsg(mp);
3809 				return;
3810 			}
3811 		} else {
3812 			/* Checksum */
3813 			uint16_t	*up;
3814 			uint32_t	sum;
3815 			int		remlen;
3816 
3817 			up = (uint16_t *)&ip6h->ip6_src;
3818 
3819 			remlen = msgdsize(mp) - hdr_len;
3820 			sum = htons(icmp->icmp_proto + remlen)
3821 			    + up[0] + up[1] + up[2] + up[3]
3822 			    + up[4] + up[5] + up[6] + up[7]
3823 			    + up[8] + up[9] + up[10] + up[11]
3824 			    + up[12] + up[13] + up[14] + up[15];
3825 			sum = (sum & 0xffff) + (sum >> 16);
3826 			sum = IP_CSUM(mp, hdr_len, sum);
3827 			if (sum != 0) {
3828 				/* IPv6 RAW checksum failed */
3829 				ip0dbg(("icmp_rput: RAW checksum "
3830 				    "failed %x\n", sum));
3831 				freemsg(mp);
3832 				BUMP_MIB(&is->is_rawip_mib,
3833 				    rawipInCksumErrs);
3834 				return;
3835 			}
3836 		}
3837 	}
3838 	/* Skip all the IPv6 headers per API */
3839 	mp->b_rptr += hdr_len;
3840 
3841 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3842 
3843 	/*
3844 	 * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to
3845 	 * maintain state information, instead of relying on icmp_t
3846 	 * structure, since there arent any locks protecting these members
3847 	 * and there is a window where there might be a race between a
3848 	 * thread setting options on the write side and a thread reading
3849 	 * these options on the read size.
3850 	 */
3851 	if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS|
3852 	    IPPF_RTHDR|IPPF_IFINDEX)) {
3853 		if (icmp->icmp_ipv6_recvhopopts &&
3854 		    (ipp.ipp_fields & IPPF_HOPOPTS) &&
3855 		    ipp.ipp_hopoptslen > hopstrip) {
3856 			udi_size += sizeof (struct T_opthdr) +
3857 			    ipp.ipp_hopoptslen - hopstrip;
3858 			icmp_opt |= IPPF_HOPOPTS;
3859 		}
3860 		if ((icmp->icmp_ipv6_recvdstopts ||
3861 		    icmp->icmp_old_ipv6_recvdstopts) &&
3862 		    (ipp.ipp_fields & IPPF_DSTOPTS)) {
3863 			udi_size += sizeof (struct T_opthdr) +
3864 			    ipp.ipp_dstoptslen;
3865 			icmp_opt |= IPPF_DSTOPTS;
3866 		}
3867 		if (((icmp->icmp_ipv6_recvdstopts &&
3868 		    icmp->icmp_ipv6_recvrthdr &&
3869 		    (ipp.ipp_fields & IPPF_RTHDR)) ||
3870 		    icmp->icmp_ipv6_recvrtdstopts) &&
3871 		    (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
3872 			udi_size += sizeof (struct T_opthdr) +
3873 			    ipp.ipp_rtdstoptslen;
3874 			icmp_opt |= IPPF_RTDSTOPTS;
3875 		}
3876 		if (icmp->icmp_ipv6_recvrthdr &&
3877 		    (ipp.ipp_fields & IPPF_RTHDR)) {
3878 			udi_size += sizeof (struct T_opthdr) +
3879 			    ipp.ipp_rthdrlen;
3880 			icmp_opt |= IPPF_RTHDR;
3881 		}
3882 		if (icmp->icmp_ip_recvpktinfo &&
3883 		    (ipp.ipp_fields & IPPF_IFINDEX)) {
3884 			udi_size += sizeof (struct T_opthdr) +
3885 			    sizeof (struct in6_pktinfo);
3886 			icmp_opt |= IPPF_IFINDEX;
3887 		}
3888 	}
3889 	if (icmp->icmp_ipv6_recvhoplimit) {
3890 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3891 		icmp_ipv6_recvhoplimit = B_TRUE;
3892 	}
3893 
3894 	if (icmp->icmp_ipv6_recvtclass)
3895 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3896 
3897 	/*
3898 	 * If SO_TIMESTAMP is set allocate the appropriate sized
3899 	 * buffer. Since gethrestime() expects a pointer aligned
3900 	 * argument, we allocate space necessary for extra
3901 	 * alignment (even though it might not be used).
3902 	 */
3903 	if (icmp->icmp_timestamp) {
3904 		udi_size += sizeof (struct T_opthdr) +
3905 		    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3906 	}
3907 
3908 	mp1 = allocb(udi_size, BPRI_MED);
3909 	if (mp1 == NULL) {
3910 		freemsg(mp);
3911 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3912 		return;
3913 	}
3914 	mp1->b_cont = mp;
3915 	mp = mp1;
3916 	mp->b_datap->db_type = M_PROTO;
3917 	tudi = (struct T_unitdata_ind *)mp->b_rptr;
3918 	mp->b_wptr = (uchar_t *)tudi + udi_size;
3919 	tudi->PRIM_type = T_UNITDATA_IND;
3920 	tudi->SRC_length = sizeof (sin6_t);
3921 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3922 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3923 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
3924 	tudi->OPT_length = udi_size;
3925 	sin6 = (sin6_t *)&tudi[1];
3926 	sin6->sin6_port = 0;
3927 	sin6->sin6_family = AF_INET6;
3928 
3929 	sin6->sin6_addr = ip6h->ip6_src;
3930 	/* No sin6_flowinfo per API */
3931 	sin6->sin6_flowinfo = 0;
3932 	/* For link-scope source pass up scope id */
3933 	if ((ipp.ipp_fields & IPPF_IFINDEX) &&
3934 	    IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
3935 		sin6->sin6_scope_id = ipp.ipp_ifindex;
3936 	else
3937 		sin6->sin6_scope_id = 0;
3938 
3939 	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
3940 	    icmp->icmp_zoneid, is->is_netstack);
3941 
3942 	if (udi_size != 0) {
3943 		uchar_t *dstopt;
3944 
3945 		dstopt = (uchar_t *)&sin6[1];
3946 		if (icmp_opt & IPPF_IFINDEX) {
3947 			struct T_opthdr *toh;
3948 			struct in6_pktinfo *pkti;
3949 
3950 			toh = (struct T_opthdr *)dstopt;
3951 			toh->level = IPPROTO_IPV6;
3952 			toh->name = IPV6_PKTINFO;
3953 			toh->len = sizeof (struct T_opthdr) +
3954 			    sizeof (*pkti);
3955 			toh->status = 0;
3956 			dstopt += sizeof (struct T_opthdr);
3957 			pkti = (struct in6_pktinfo *)dstopt;
3958 			pkti->ipi6_addr = ip6h->ip6_dst;
3959 			pkti->ipi6_ifindex = ipp.ipp_ifindex;
3960 			dstopt += sizeof (*pkti);
3961 			udi_size -= toh->len;
3962 		}
3963 		if (icmp_ipv6_recvhoplimit) {
3964 			struct T_opthdr *toh;
3965 
3966 			toh = (struct T_opthdr *)dstopt;
3967 			toh->level = IPPROTO_IPV6;
3968 			toh->name = IPV6_HOPLIMIT;
3969 			toh->len = sizeof (struct T_opthdr) +
3970 			    sizeof (uint_t);
3971 			toh->status = 0;
3972 			dstopt += sizeof (struct T_opthdr);
3973 			*(uint_t *)dstopt = ip6h->ip6_hops;
3974 			dstopt += sizeof (uint_t);
3975 			udi_size -= toh->len;
3976 		}
3977 		if (icmp->icmp_ipv6_recvtclass) {
3978 			struct T_opthdr *toh;
3979 
3980 			toh = (struct T_opthdr *)dstopt;
3981 			toh->level = IPPROTO_IPV6;
3982 			toh->name = IPV6_TCLASS;
3983 			toh->len = sizeof (struct T_opthdr) +
3984 			    sizeof (uint_t);
3985 			toh->status = 0;
3986 			dstopt += sizeof (struct T_opthdr);
3987 			*(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
3988 			dstopt += sizeof (uint_t);
3989 			udi_size -= toh->len;
3990 		}
3991 		if (icmp->icmp_timestamp) {
3992 			struct  T_opthdr *toh;
3993 
3994 			toh = (struct T_opthdr *)dstopt;
3995 			toh->level = SOL_SOCKET;
3996 			toh->name = SCM_TIMESTAMP;
3997 			toh->len = sizeof (struct T_opthdr) +
3998 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3999 			toh->status = 0;
4000 			dstopt += sizeof (struct T_opthdr);
4001 			/* Align for gethrestime() */
4002 			dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt,
4003 			    sizeof (intptr_t));
4004 			gethrestime((timestruc_t *)dstopt);
4005 			dstopt = (uchar_t *)toh + toh->len;
4006 			udi_size -= toh->len;
4007 		}
4008 
4009 		if (icmp_opt & IPPF_HOPOPTS) {
4010 			struct T_opthdr *toh;
4011 
4012 			toh = (struct T_opthdr *)dstopt;
4013 			toh->level = IPPROTO_IPV6;
4014 			toh->name = IPV6_HOPOPTS;
4015 			toh->len = sizeof (struct T_opthdr) +
4016 			    ipp.ipp_hopoptslen - hopstrip;
4017 			toh->status = 0;
4018 			dstopt += sizeof (struct T_opthdr);
4019 			bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt,
4020 			    ipp.ipp_hopoptslen - hopstrip);
4021 			if (hopstrip > 0) {
4022 				/* copy next header value and fake length */
4023 				dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0];
4024 				dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] -
4025 				    hopstrip / 8;
4026 			}
4027 			dstopt += ipp.ipp_hopoptslen - hopstrip;
4028 			udi_size -= toh->len;
4029 		}
4030 		if (icmp_opt & IPPF_RTDSTOPTS) {
4031 			struct T_opthdr *toh;
4032 
4033 			toh = (struct T_opthdr *)dstopt;
4034 			toh->level = IPPROTO_IPV6;
4035 			toh->name = IPV6_DSTOPTS;
4036 			toh->len = sizeof (struct T_opthdr) +
4037 			    ipp.ipp_rtdstoptslen;
4038 			toh->status = 0;
4039 			dstopt += sizeof (struct T_opthdr);
4040 			bcopy(ipp.ipp_rtdstopts, dstopt,
4041 			    ipp.ipp_rtdstoptslen);
4042 			dstopt += ipp.ipp_rtdstoptslen;
4043 			udi_size -= toh->len;
4044 		}
4045 		if (icmp_opt & IPPF_RTHDR) {
4046 			struct T_opthdr *toh;
4047 
4048 			toh = (struct T_opthdr *)dstopt;
4049 			toh->level = IPPROTO_IPV6;
4050 			toh->name = IPV6_RTHDR;
4051 			toh->len = sizeof (struct T_opthdr) +
4052 			    ipp.ipp_rthdrlen;
4053 			toh->status = 0;
4054 			dstopt += sizeof (struct T_opthdr);
4055 			bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen);
4056 			dstopt += ipp.ipp_rthdrlen;
4057 			udi_size -= toh->len;
4058 		}
4059 		if (icmp_opt & IPPF_DSTOPTS) {
4060 			struct T_opthdr *toh;
4061 
4062 			toh = (struct T_opthdr *)dstopt;
4063 			toh->level = IPPROTO_IPV6;
4064 			toh->name = IPV6_DSTOPTS;
4065 			toh->len = sizeof (struct T_opthdr) +
4066 			    ipp.ipp_dstoptslen;
4067 			toh->status = 0;
4068 			dstopt += sizeof (struct T_opthdr);
4069 			bcopy(ipp.ipp_dstopts, dstopt,
4070 			    ipp.ipp_dstoptslen);
4071 			dstopt += ipp.ipp_dstoptslen;
4072 			udi_size -= toh->len;
4073 		}
4074 		/* Consumed all of allocated space */
4075 		ASSERT(udi_size == 0);
4076 	}
4077 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
4078 
4079 deliver:
4080 	icmp_ulp_recv(connp, mp);
4081 
4082 }
4083 
4084 /*
4085  * return SNMP stuff in buffer in mpdata
4086  */
4087 mblk_t *
4088 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
4089 {
4090 	mblk_t			*mpdata;
4091 	struct opthdr		*optp;
4092 	conn_t			*connp = Q_TO_CONN(q);
4093 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
4094 	mblk_t			*mp2ctl;
4095 
4096 	/*
4097 	 * make a copy of the original message
4098 	 */
4099 	mp2ctl = copymsg(mpctl);
4100 
4101 	if (mpctl == NULL ||
4102 	    (mpdata = mpctl->b_cont) == NULL) {
4103 		freemsg(mpctl);
4104 		freemsg(mp2ctl);
4105 		return (0);
4106 	}
4107 
4108 	/* fixed length structure for IPv4 and IPv6 counters */
4109 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
4110 	optp->level = EXPER_RAWIP;
4111 	optp->name = 0;
4112 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
4113 	    sizeof (is->is_rawip_mib));
4114 	optp->len = msgdsize(mpdata);
4115 	qreply(q, mpctl);
4116 
4117 	return (mp2ctl);
4118 }
4119 
4120 /*
4121  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
4122  * TODO:  If this ever actually tries to set anything, it needs to be
4123  * to do the appropriate locking.
4124  */
4125 /* ARGSUSED */
4126 int
4127 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
4128     uchar_t *ptr, int len)
4129 {
4130 	switch (level) {
4131 	case EXPER_RAWIP:
4132 		return (0);
4133 	default:
4134 		return (1);
4135 	}
4136 }
4137 
4138 /* Report for ndd "icmp_status" */
4139 /* ARGSUSED */
4140 static int
4141 icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
4142 {
4143 	conn_t  *connp;
4144 	ip_stack_t *ipst;
4145 	char	laddrbuf[INET6_ADDRSTRLEN];
4146 	char	faddrbuf[INET6_ADDRSTRLEN];
4147 	int	i;
4148 
4149 	(void) mi_mpprintf(mp,
4150 	    "RAWIP    " MI_COL_HDRPAD_STR
4151 	/*   01234567[89ABCDEF] */
4152 	    "  src addr        dest addr       state");
4153 	/*   xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx UNBOUND */
4154 
4155 	connp = Q_TO_CONN(q);
4156 	ipst = connp->conn_netstack->netstack_ip;
4157 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
4158 		connf_t *connfp;
4159 		char	*state;
4160 
4161 		connfp = &ipst->ips_ipcl_globalhash_fanout[i];
4162 		connp = NULL;
4163 
4164 		while ((connp = ipcl_get_next_conn(connfp, connp,
4165 		    IPCL_RAWIPCONN)) != NULL) {
4166 			icmp_t  *icmp;
4167 
4168 			mutex_enter(&(connp)->conn_lock);
4169 			icmp = connp->conn_icmp;
4170 
4171 			if (icmp->icmp_state == TS_UNBND)
4172 				state = "UNBOUND";
4173 			else if (icmp->icmp_state == TS_IDLE)
4174 				state = "IDLE";
4175 			else if (icmp->icmp_state == TS_DATA_XFER)
4176 				state = "CONNECTED";
4177 			else
4178 				state = "UnkState";
4179 
4180 			(void) mi_mpprintf(mp, MI_COL_PTRFMT_STR "%s %s %s",
4181 			    (void *)icmp,
4182 			    inet_ntop(AF_INET6, &icmp->icmp_v6dst.sin6_addr,
4183 			    faddrbuf,
4184 			    sizeof (faddrbuf)),
4185 			    inet_ntop(AF_INET6, &icmp->icmp_v6src, laddrbuf,
4186 			    sizeof (laddrbuf)),
4187 			    state);
4188 			mutex_exit(&(connp)->conn_lock);
4189 		}
4190 	}
4191 	return (0);
4192 }
4193 
4194 /*
4195  * This routine creates a T_UDERROR_IND message and passes it upstream.
4196  * The address and options are copied from the T_UNITDATA_REQ message
4197  * passed in mp.  This message is freed.
4198  */
4199 static void
4200 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
4201 {
4202 	mblk_t	*mp1;
4203 	uchar_t	*rptr = mp->b_rptr;
4204 	struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr;
4205 
4206 	mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset],
4207 	    tudr->DEST_length, (char *)&rptr[tudr->OPT_offset],
4208 	    tudr->OPT_length, err);
4209 	if (mp1)
4210 		qreply(q, mp1);
4211 	freemsg(mp);
4212 }
4213 
4214 
4215 static int
4216 rawip_do_unbind(conn_t *connp)
4217 {
4218 	icmp_t *icmp = connp->conn_icmp;
4219 
4220 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4221 	/* If a bind has not been done, we can't unbind. */
4222 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
4223 		rw_exit(&icmp->icmp_rwlock);
4224 		return (-TOUTSTATE);
4225 	}
4226 	icmp->icmp_pending_op = T_UNBIND_REQ;
4227 	rw_exit(&icmp->icmp_rwlock);
4228 
4229 	/*
4230 	 * Call ip to unbind
4231 	 */
4232 
4233 	ip_unbind(connp);
4234 
4235 	/*
4236 	 * Once we're unbound from IP, the pending operation may be cleared
4237 	 * here.
4238 	 */
4239 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4240 	V6_SET_ZERO(icmp->icmp_v6src);
4241 	V6_SET_ZERO(icmp->icmp_bound_v6src);
4242 	icmp->icmp_pending_op = -1;
4243 	icmp->icmp_state = TS_UNBND;
4244 	if (icmp->icmp_family == AF_INET6)
4245 		(void) icmp_build_hdrs(icmp);
4246 	rw_exit(&icmp->icmp_rwlock);
4247 	return (0);
4248 }
4249 
4250 /*
4251  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
4252  * After some error checking, the message is passed downstream to ip.
4253  */
4254 static void
4255 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
4256 {
4257 	conn_t	*connp = Q_TO_CONN(q);
4258 	int	error;
4259 
4260 	ASSERT(mp->b_cont == NULL);
4261 	error = rawip_do_unbind(connp);
4262 	if (error) {
4263 		if (error < 0) {
4264 			icmp_err_ack(q, mp, -error, 0);
4265 		} else {
4266 			icmp_err_ack(q, mp, 0, error);
4267 		}
4268 		return;
4269 	}
4270 
4271 	/*
4272 	 * Convert mp into a T_OK_ACK
4273 	 */
4274 
4275 	mp = mi_tpi_ok_ack_alloc(mp);
4276 
4277 	/*
4278 	 * should not happen in practice... T_OK_ACK is smaller than the
4279 	 * original message.
4280 	 */
4281 	ASSERT(mp != NULL);
4282 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
4283 	qreply(q, mp);
4284 }
4285 
4286 
4287 /*
4288  * Process IPv4 packets that already include an IP header.
4289  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
4290  * IPPROTO_IGMP).
4291  */
4292 static int
4293 icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp,
4294     ip4_pkt_t *pktinfop)
4295 {
4296 	icmp_stack_t *is = icmp->icmp_is;
4297 	ipha_t	*ipha;
4298 	int	ip_hdr_length;
4299 	int	tp_hdr_len;
4300 	mblk_t	*mp1;
4301 	uint_t	pkt_len;
4302 	ip_opt_info_t optinfo;
4303 
4304 	optinfo.ip_opt_flags = 0;
4305 	optinfo.ip_opt_ill_index = 0;
4306 	ipha = (ipha_t *)mp->b_rptr;
4307 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4308 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
4309 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4310 			ASSERT(icmp != NULL);
4311 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4312 			freemsg(mp);
4313 			return (0);
4314 		}
4315 		ipha = (ipha_t *)mp->b_rptr;
4316 	}
4317 	ipha->ipha_version_and_hdr_length =
4318 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
4319 
4320 	/*
4321 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4322 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4323 	 * tell IP that the application has sent a complete IP header and not
4324 	 * to compute the transport checksum nor change the DF flag.
4325 	 */
4326 	ipha->ipha_ident = IP_HDR_INCLUDED;
4327 	ipha->ipha_hdr_checksum = 0;
4328 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
4329 	/* Insert options if any */
4330 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4331 		/*
4332 		 * Put the IP header plus any transport header that is
4333 		 * checksumed by ip_wput into the first mblk. (ip_wput assumes
4334 		 * that at least the checksum field is in the first mblk.)
4335 		 */
4336 		switch (ipha->ipha_protocol) {
4337 		case IPPROTO_UDP:
4338 			tp_hdr_len = 8;
4339 			break;
4340 		case IPPROTO_TCP:
4341 			tp_hdr_len = 20;
4342 			break;
4343 		default:
4344 			tp_hdr_len = 0;
4345 			break;
4346 		}
4347 		/*
4348 		 * The code below assumes that IP_SIMPLE_HDR_LENGTH plus
4349 		 * tp_hdr_len bytes will be in a single mblk.
4350 		 */
4351 		if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH +
4352 		    tp_hdr_len)) {
4353 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH +
4354 			    tp_hdr_len)) {
4355 				BUMP_MIB(&is->is_rawip_mib,
4356 				    rawipOutErrors);
4357 				freemsg(mp);
4358 				return (0);
4359 			}
4360 			ipha = (ipha_t *)mp->b_rptr;
4361 		}
4362 
4363 		/*
4364 		 * if the length is larger then the max allowed IP packet,
4365 		 * then send an error and abort the processing.
4366 		 */
4367 		pkt_len = ntohs(ipha->ipha_length)
4368 		    + icmp->icmp_ip_snd_options_len;
4369 		if (pkt_len > IP_MAXPACKET) {
4370 			return (EMSGSIZE);
4371 		}
4372 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra +
4373 		    tp_hdr_len, BPRI_LO))) {
4374 			return (ENOMEM);
4375 		}
4376 		mp1->b_rptr += is->is_wroff_extra;
4377 		mp1->b_wptr = mp1->b_rptr + ip_hdr_length;
4378 
4379 		ipha->ipha_length = htons((uint16_t)pkt_len);
4380 		bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH);
4381 
4382 		/* Copy transport header if any */
4383 		bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len);
4384 		mp1->b_wptr += tp_hdr_len;
4385 
4386 		/* Add options */
4387 		ipha = (ipha_t *)mp1->b_rptr;
4388 		bcopy(icmp->icmp_ip_snd_options, &ipha[1],
4389 		    icmp->icmp_ip_snd_options_len);
4390 
4391 		/* Drop IP header and transport header from original */
4392 		(void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len);
4393 
4394 		mp1->b_cont = mp;
4395 		mp = mp1;
4396 		/*
4397 		 * Massage source route putting first source
4398 		 * route in ipha_dst.
4399 		 */
4400 		(void) ip_massage_options(ipha, is->is_netstack);
4401 	}
4402 
4403 	if (pktinfop != NULL) {
4404 		/*
4405 		 * Over write the source address provided in the header
4406 		 */
4407 		if (pktinfop->ip4_addr != INADDR_ANY) {
4408 			ipha->ipha_src = pktinfop->ip4_addr;
4409 			optinfo.ip_opt_flags = IP_VERIFY_SRC;
4410 		}
4411 
4412 		if (pktinfop->ip4_ill_index != 0) {
4413 			optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4414 		}
4415 	}
4416 
4417 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4418 	return (0);
4419 }
4420 
4421 static int
4422 icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
4423 {
4424 	int err;
4425 	uchar_t opt_storage[IP_MAX_OPT_LENGTH];
4426 	icmp_stack_t		*is = icmp->icmp_is;
4427 	conn_t			*connp = icmp->icmp_connp;
4428 	cred_t			*cr;
4429 
4430 	/*
4431 	 * All Solaris components should pass a db_credp
4432 	 * for this message, hence we ASSERT.
4433 	 * On production kernels we return an error to be robust against
4434 	 * random streams modules sitting on top of us.
4435 	 */
4436 	cr = msg_getcred(mp, NULL);
4437 	ASSERT(cr != NULL);
4438 	if (cr == NULL)
4439 		return (EINVAL);
4440 
4441 	err = tsol_compute_label(cr, dst,
4442 	    opt_storage, connp->conn_mac_exempt,
4443 	    is->is_netstack->netstack_ip);
4444 	if (err == 0) {
4445 		err = tsol_update_options(&icmp->icmp_ip_snd_options,
4446 		    &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len,
4447 		    opt_storage);
4448 	}
4449 	if (err != 0) {
4450 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4451 		DTRACE_PROBE4(
4452 		    tx__ip__log__drop__updatelabel__icmp,
4453 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4454 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4455 		return (err);
4456 	}
4457 	IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst);
4458 	return (0);
4459 }
4460 
4461 /*
4462  * This routine handles all messages passed downstream.  It either
4463  * consumes the message or passes it downstream; it never queues a
4464  * a message.
4465  */
4466 static void
4467 icmp_wput(queue_t *q, mblk_t *mp)
4468 {
4469 	uchar_t	*rptr = mp->b_rptr;
4470 	ipha_t	*ipha;
4471 	mblk_t	*mp1;
4472 #define	tudr ((struct T_unitdata_req *)rptr)
4473 	size_t	ip_len;
4474 	conn_t	*connp = Q_TO_CONN(q);
4475 	icmp_t	*icmp = connp->conn_icmp;
4476 	icmp_stack_t *is = icmp->icmp_is;
4477 	sin6_t	*sin6;
4478 	sin_t	*sin;
4479 	ipaddr_t	v4dst;
4480 	ip4_pkt_t	pktinfo;
4481 	ip4_pkt_t	*pktinfop = &pktinfo;
4482 	ip6_pkt_t	ipp_s;  /* For ancillary data options */
4483 	ip6_pkt_t	*ipp = &ipp_s;
4484 	int error;
4485 
4486 	ipp->ipp_fields = 0;
4487 	ipp->ipp_sticky_ignored = 0;
4488 
4489 	switch (mp->b_datap->db_type) {
4490 	case M_DATA:
4491 		if (icmp->icmp_hdrincl) {
4492 			ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
4493 			ipha = (ipha_t *)mp->b_rptr;
4494 			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
4495 				if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4496 					BUMP_MIB(&is->is_rawip_mib,
4497 					    rawipOutErrors);
4498 					freemsg(mp);
4499 					return;
4500 				}
4501 				ipha = (ipha_t *)mp->b_rptr;
4502 			}
4503 			/*
4504 			 * If this connection was used for v6 (inconceivable!)
4505 			 * or if we have a new destination, then it's time to
4506 			 * figure a new label.
4507 			 */
4508 			if (is_system_labeled() &&
4509 			    (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4510 			    V4_PART_OF_V6(icmp->icmp_v6lastdst) !=
4511 			    ipha->ipha_dst)) {
4512 				error = icmp_update_label(icmp, mp,
4513 				    ipha->ipha_dst);
4514 				if (error != 0) {
4515 					icmp_ud_err(q, mp, error);
4516 					return;
4517 				}
4518 			}
4519 			error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL);
4520 			if (error != 0)
4521 				icmp_ud_err(q, mp, error);
4522 			return;
4523 		}
4524 		freemsg(mp);
4525 		return;
4526 	case M_PROTO:
4527 	case M_PCPROTO:
4528 		ip_len = mp->b_wptr - rptr;
4529 		if (ip_len >= sizeof (struct T_unitdata_req)) {
4530 			/* Expedite valid T_UNITDATA_REQ to below the switch */
4531 			if (((union T_primitives *)rptr)->type
4532 			    == T_UNITDATA_REQ)
4533 				break;
4534 		}
4535 		/* FALLTHRU */
4536 	default:
4537 		icmp_wput_other(q, mp);
4538 		return;
4539 	}
4540 
4541 	/* Handle T_UNITDATA_REQ messages here. */
4542 
4543 	mp1 = mp->b_cont;
4544 	if (mp1 == NULL) {
4545 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4546 		icmp_ud_err(q, mp, EPROTO);
4547 		return;
4548 	}
4549 
4550 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
4551 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4552 		icmp_ud_err(q, mp, EADDRNOTAVAIL);
4553 		return;
4554 	}
4555 
4556 	switch (icmp->icmp_family) {
4557 	case AF_INET6:
4558 		sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
4559 		if (!OK_32PTR((char *)sin6) ||
4560 		    tudr->DEST_length != sizeof (sin6_t) ||
4561 		    sin6->sin6_family != AF_INET6) {
4562 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4563 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4564 			return;
4565 		}
4566 
4567 		/* No support for mapped addresses on raw sockets */
4568 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4569 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4570 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4571 			return;
4572 		}
4573 
4574 		/*
4575 		 * Destination is a native IPv6 address.
4576 		 * Send out an IPv6 format packet.
4577 		 */
4578 		if (tudr->OPT_length != 0) {
4579 			int error;
4580 
4581 			error = 0;
4582 			if (icmp_unitdata_opt_process(q, mp, &error,
4583 			    (void *)ipp) < 0) {
4584 				/* failure */
4585 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4586 				icmp_ud_err(q, mp, error);
4587 				return;
4588 			}
4589 			ASSERT(error == 0);
4590 		}
4591 
4592 		error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp);
4593 		goto done;
4594 
4595 	case AF_INET:
4596 		sin = (sin_t *)&rptr[tudr->DEST_offset];
4597 		if (!OK_32PTR((char *)sin) ||
4598 		    tudr->DEST_length != sizeof (sin_t) ||
4599 		    sin->sin_family != AF_INET) {
4600 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4601 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4602 			return;
4603 		}
4604 		/* Extract and ipaddr */
4605 		v4dst = sin->sin_addr.s_addr;
4606 		break;
4607 
4608 	default:
4609 		ASSERT(0);
4610 	}
4611 
4612 	pktinfop->ip4_ill_index = 0;
4613 	pktinfop->ip4_addr = INADDR_ANY;
4614 
4615 	/*
4616 	 * If options passed in, feed it for verification and handling
4617 	 */
4618 	if (tudr->OPT_length != 0) {
4619 		int error;
4620 
4621 		error = 0;
4622 		if (icmp_unitdata_opt_process(q, mp, &error,
4623 		    (void *)pktinfop) < 0) {
4624 			/* failure */
4625 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4626 			icmp_ud_err(q, mp, error);
4627 			return;
4628 		}
4629 		ASSERT(error == 0);
4630 		/*
4631 		 * Note: Success in processing options.
4632 		 * mp option buffer represented by
4633 		 * OPT_length/offset now potentially modified
4634 		 * and contain option setting results
4635 		 */
4636 	}
4637 
4638 	error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop);
4639 done:
4640 	if (error != 0) {
4641 		icmp_ud_err(q, mp, error);
4642 		return;
4643 	} else {
4644 		mp->b_cont = NULL;
4645 		freeb(mp);
4646 	}
4647 }
4648 
4649 
4650 /* ARGSUSED */
4651 static void
4652 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4653 {
4654 #ifdef DEBUG
4655 	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4656 #endif
4657 	freemsg(mp);
4658 }
4659 
4660 static int
4661 raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst,
4662     ip4_pkt_t *pktinfop)
4663 {
4664 	ipha_t	*ipha;
4665 	size_t	ip_len;
4666 	icmp_t	*icmp = connp->conn_icmp;
4667 	icmp_stack_t *is = icmp->icmp_is;
4668 	int	ip_hdr_length;
4669 	ip_opt_info_t	optinfo;
4670 
4671 	optinfo.ip_opt_flags = 0;
4672 	optinfo.ip_opt_ill_index = 0;
4673 
4674 	if (icmp->icmp_state == TS_UNBND) {
4675 		/* If a port has not been bound to the stream, fail. */
4676 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4677 		return (EPROTO);
4678 	}
4679 
4680 	if (v4dst == INADDR_ANY)
4681 		v4dst = htonl(INADDR_LOOPBACK);
4682 
4683 	/* Check if our saved options are valid; update if not */
4684 	if (is_system_labeled() &&
4685 	    (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4686 	    V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst)) {
4687 		int error = icmp_update_label(icmp, mp, v4dst);
4688 
4689 		if (error != 0)
4690 			return (error);
4691 	}
4692 
4693 	/* Protocol 255 contains full IP headers */
4694 	if (icmp->icmp_hdrincl)
4695 		return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop));
4696 
4697 	/* Add an IP header */
4698 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4699 	ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length];
4700 	if ((uchar_t *)ipha < mp->b_datap->db_base ||
4701 	    mp->b_datap->db_ref != 1 ||
4702 	    !OK_32PTR(ipha)) {
4703 		mblk_t	*mp1;
4704 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra,
4705 		    BPRI_LO))) {
4706 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4707 			return (ENOMEM);
4708 		}
4709 		mp1->b_cont = mp;
4710 		ipha = (ipha_t *)mp1->b_datap->db_lim;
4711 		mp1->b_wptr = (uchar_t *)ipha;
4712 		ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length);
4713 		mp = mp1;
4714 	}
4715 #ifdef	_BIG_ENDIAN
4716 	/* Set version, header length, and tos */
4717 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4718 	    ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) |
4719 	    icmp->icmp_type_of_service);
4720 	/* Set ttl and protocol */
4721 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto;
4722 #else
4723 	/* Set version, header length, and tos */
4724 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4725 	    ((icmp->icmp_type_of_service << 8) |
4726 	    ((IP_VERSION << 4) | (ip_hdr_length>>2)));
4727 	/* Set ttl and protocol */
4728 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl;
4729 #endif
4730 	if (pktinfop->ip4_addr != INADDR_ANY) {
4731 		ipha->ipha_src = pktinfop->ip4_addr;
4732 		optinfo.ip_opt_flags = IP_VERIFY_SRC;
4733 	} else {
4734 
4735 		/*
4736 		 * Copy our address into the packet.  If this is zero,
4737 		 * ip will fill in the real source address.
4738 		 */
4739 		IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src);
4740 	}
4741 
4742 	ipha->ipha_fragment_offset_and_flags = 0;
4743 
4744 	if (pktinfop->ip4_ill_index != 0) {
4745 		optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4746 	}
4747 
4748 
4749 	/*
4750 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4751 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4752 	 * tell IP that the application has sent a complete IP header and not
4753 	 * to compute the transport checksum nor change the DF flag.
4754 	 */
4755 	ipha->ipha_ident = IP_HDR_INCLUDED;
4756 
4757 	/* Finish common formatting of the packet. */
4758 	mp->b_rptr = (uchar_t *)ipha;
4759 
4760 	ip_len = mp->b_wptr - (uchar_t *)ipha;
4761 	if (mp->b_cont != NULL)
4762 		ip_len += msgdsize(mp->b_cont);
4763 
4764 	/*
4765 	 * Set the length into the IP header.
4766 	 * If the length is greater than the maximum allowed by IP,
4767 	 * then free the message and return. Do not try and send it
4768 	 * as this can cause problems in layers below.
4769 	 */
4770 	if (ip_len > IP_MAXPACKET) {
4771 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4772 		return (EMSGSIZE);
4773 	}
4774 	ipha->ipha_length = htons((uint16_t)ip_len);
4775 	/*
4776 	 * Copy in the destination address request
4777 	 */
4778 	ipha->ipha_dst = v4dst;
4779 
4780 	/*
4781 	 * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic.
4782 	 */
4783 	if (CLASSD(v4dst))
4784 		ipha->ipha_ttl = icmp->icmp_multicast_ttl;
4785 
4786 	/* Copy in options if any */
4787 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4788 		bcopy(icmp->icmp_ip_snd_options,
4789 		    &ipha[1], icmp->icmp_ip_snd_options_len);
4790 		/*
4791 		 * Massage source route putting first source route in ipha_dst.
4792 		 * Ignore the destination in the T_unitdata_req.
4793 		 */
4794 		(void) ip_massage_options(ipha, is->is_netstack);
4795 	}
4796 
4797 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4798 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4799 	return (0);
4800 }
4801 
4802 static int
4803 icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
4804 {
4805 	int err;
4806 	uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
4807 	icmp_stack_t		*is = icmp->icmp_is;
4808 	conn_t			*connp = icmp->icmp_connp;
4809 	cred_t			*cr;
4810 
4811 	/*
4812 	 * All Solaris components should pass a db_credp
4813 	 * for this message, hence we ASSERT.
4814 	 * On production kernels we return an error to be robust against
4815 	 * random streams modules sitting on top of us.
4816 	 */
4817 	cr = msg_getcred(mp, NULL);
4818 	ASSERT(cr != NULL);
4819 	if (cr == NULL)
4820 		return (EINVAL);
4821 
4822 	err = tsol_compute_label_v6(cr, dst,
4823 	    opt_storage, connp->conn_mac_exempt,
4824 	    is->is_netstack->netstack_ip);
4825 	if (err == 0) {
4826 		err = tsol_update_sticky(&icmp->icmp_sticky_ipp,
4827 		    &icmp->icmp_label_len_v6, opt_storage);
4828 	}
4829 	if (err != 0) {
4830 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4831 		DTRACE_PROBE4(
4832 		    tx__ip__log__drop__updatelabel__icmp6,
4833 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4834 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4835 		return (err);
4836 	}
4837 
4838 	icmp->icmp_v6lastdst = *dst;
4839 	return (0);
4840 }
4841 
4842 /*
4843  * raw_ip_send_data_v6():
4844  * Assumes that icmp_wput did some sanity checking on the destination
4845  * address, but that the label may not yet be correct.
4846  */
4847 static int
4848 raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6,
4849     ip6_pkt_t *ipp)
4850 {
4851 	ip6_t			*ip6h;
4852 	ip6i_t			*ip6i;	/* mp->b_rptr even if no ip6i_t */
4853 	int			ip_hdr_len = IPV6_HDR_LEN;
4854 	size_t			ip_len;
4855 	icmp_t			*icmp = connp->conn_icmp;
4856 	icmp_stack_t		*is = icmp->icmp_is;
4857 	ip6_pkt_t		*tipp;
4858 	uint32_t		csum = 0;
4859 	uint_t			ignore = 0;
4860 	uint_t			option_exists = 0, is_sticky = 0;
4861 	uint8_t			*cp;
4862 	uint8_t			*nxthdr_ptr;
4863 	in6_addr_t		ip6_dst;
4864 
4865 	/*
4866 	 * If the local address is a mapped address return
4867 	 * an error.
4868 	 * It would be possible to send an IPv6 packet but the
4869 	 * response would never make it back to the application
4870 	 * since it is bound to a mapped address.
4871 	 */
4872 	if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) {
4873 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4874 		return (EADDRNOTAVAIL);
4875 	}
4876 
4877 	ignore = ipp->ipp_sticky_ignored;
4878 	if (sin6->sin6_scope_id != 0 &&
4879 	    IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
4880 		/*
4881 		 * IPPF_SCOPE_ID is special.  It's neither a sticky
4882 		 * option nor ancillary data.  It needs to be
4883 		 * explicitly set in options_exists.
4884 		 */
4885 		option_exists |= IPPF_SCOPE_ID;
4886 	}
4887 
4888 	/*
4889 	 * Compute the destination address
4890 	 */
4891 	ip6_dst = sin6->sin6_addr;
4892 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4893 		ip6_dst = ipv6_loopback;
4894 
4895 	/*
4896 	 * If we're not going to the same destination as last time, then
4897 	 * recompute the label required.  This is done in a separate routine to
4898 	 * avoid blowing up our stack here.
4899 	 */
4900 	if (is_system_labeled() &&
4901 	    !IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst)) {
4902 		int error = 0;
4903 
4904 		error = icmp_update_label_v6(icmp, mp, &ip6_dst);
4905 		if (error != 0)
4906 			return (error);
4907 	}
4908 
4909 	/*
4910 	 * If there's a security label here, then we ignore any options the
4911 	 * user may try to set.  We keep the peer's label as a hidden sticky
4912 	 * option.
4913 	 */
4914 	if (icmp->icmp_label_len_v6 > 0) {
4915 		ignore &= ~IPPF_HOPOPTS;
4916 		ipp->ipp_fields &= ~IPPF_HOPOPTS;
4917 	}
4918 
4919 	if ((icmp->icmp_sticky_ipp.ipp_fields == 0) &&
4920 	    (ipp->ipp_fields == 0)) {
4921 		/* No sticky options nor ancillary data. */
4922 		goto no_options;
4923 	}
4924 
4925 	/*
4926 	 * Go through the options figuring out where each is going to
4927 	 * come from and build two masks.  The first mask indicates if
4928 	 * the option exists at all.  The second mask indicates if the
4929 	 * option is sticky or ancillary.
4930 	 */
4931 	if (!(ignore & IPPF_HOPOPTS)) {
4932 		if (ipp->ipp_fields & IPPF_HOPOPTS) {
4933 			option_exists |= IPPF_HOPOPTS;
4934 			ip_hdr_len += ipp->ipp_hopoptslen;
4935 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
4936 			option_exists |= IPPF_HOPOPTS;
4937 			is_sticky |= IPPF_HOPOPTS;
4938 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_hopoptslen;
4939 		}
4940 	}
4941 
4942 	if (!(ignore & IPPF_RTHDR)) {
4943 		if (ipp->ipp_fields & IPPF_RTHDR) {
4944 			option_exists |= IPPF_RTHDR;
4945 			ip_hdr_len += ipp->ipp_rthdrlen;
4946 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
4947 			option_exists |= IPPF_RTHDR;
4948 			is_sticky |= IPPF_RTHDR;
4949 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen;
4950 		}
4951 	}
4952 
4953 	if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) {
4954 		/*
4955 		 * Need to have a router header to use these.
4956 		 */
4957 		if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
4958 			option_exists |= IPPF_RTDSTOPTS;
4959 			ip_hdr_len += ipp->ipp_rtdstoptslen;
4960 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
4961 			option_exists |= IPPF_RTDSTOPTS;
4962 			is_sticky |= IPPF_RTDSTOPTS;
4963 			ip_hdr_len +=
4964 			    icmp->icmp_sticky_ipp.ipp_rtdstoptslen;
4965 		}
4966 	}
4967 
4968 	if (!(ignore & IPPF_DSTOPTS)) {
4969 		if (ipp->ipp_fields & IPPF_DSTOPTS) {
4970 			option_exists |= IPPF_DSTOPTS;
4971 			ip_hdr_len += ipp->ipp_dstoptslen;
4972 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
4973 			option_exists |= IPPF_DSTOPTS;
4974 			is_sticky |= IPPF_DSTOPTS;
4975 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen;
4976 		}
4977 	}
4978 
4979 	if (!(ignore & IPPF_IFINDEX)) {
4980 		if (ipp->ipp_fields & IPPF_IFINDEX) {
4981 			option_exists |= IPPF_IFINDEX;
4982 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) {
4983 			option_exists |= IPPF_IFINDEX;
4984 			is_sticky |= IPPF_IFINDEX;
4985 		}
4986 	}
4987 
4988 	if (!(ignore & IPPF_ADDR)) {
4989 		if (ipp->ipp_fields & IPPF_ADDR) {
4990 			option_exists |= IPPF_ADDR;
4991 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) {
4992 			option_exists |= IPPF_ADDR;
4993 			is_sticky |= IPPF_ADDR;
4994 		}
4995 	}
4996 
4997 	if (!(ignore & IPPF_DONTFRAG)) {
4998 		if (ipp->ipp_fields & IPPF_DONTFRAG) {
4999 			option_exists |= IPPF_DONTFRAG;
5000 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) {
5001 			option_exists |= IPPF_DONTFRAG;
5002 			is_sticky |= IPPF_DONTFRAG;
5003 		}
5004 	}
5005 
5006 	if (!(ignore & IPPF_USE_MIN_MTU)) {
5007 		if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
5008 			option_exists |= IPPF_USE_MIN_MTU;
5009 		} else if (icmp->icmp_sticky_ipp.ipp_fields &
5010 		    IPPF_USE_MIN_MTU) {
5011 			option_exists |= IPPF_USE_MIN_MTU;
5012 			is_sticky |= IPPF_USE_MIN_MTU;
5013 		}
5014 	}
5015 
5016 	if (!(ignore & IPPF_NEXTHOP)) {
5017 		if (ipp->ipp_fields & IPPF_NEXTHOP) {
5018 			option_exists |= IPPF_NEXTHOP;
5019 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) {
5020 			option_exists |= IPPF_NEXTHOP;
5021 			is_sticky |= IPPF_NEXTHOP;
5022 		}
5023 	}
5024 
5025 	if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT))
5026 		option_exists |= IPPF_HOPLIMIT;
5027 	/* IPV6_HOPLIMIT can never be sticky */
5028 	ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT));
5029 
5030 	if (!(ignore & IPPF_UNICAST_HOPS) &&
5031 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) {
5032 		option_exists |= IPPF_UNICAST_HOPS;
5033 		is_sticky |= IPPF_UNICAST_HOPS;
5034 	}
5035 
5036 	if (!(ignore & IPPF_MULTICAST_HOPS) &&
5037 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) {
5038 		option_exists |= IPPF_MULTICAST_HOPS;
5039 		is_sticky |= IPPF_MULTICAST_HOPS;
5040 	}
5041 
5042 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) {
5043 		/* This is a sticky socket option only */
5044 		option_exists |= IPPF_NO_CKSUM;
5045 		is_sticky |= IPPF_NO_CKSUM;
5046 	}
5047 
5048 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) {
5049 		/* This is a sticky socket option only */
5050 		option_exists |= IPPF_RAW_CKSUM;
5051 		is_sticky |= IPPF_RAW_CKSUM;
5052 	}
5053 
5054 	if (!(ignore & IPPF_TCLASS)) {
5055 		if (ipp->ipp_fields & IPPF_TCLASS) {
5056 			option_exists |= IPPF_TCLASS;
5057 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) {
5058 			option_exists |= IPPF_TCLASS;
5059 			is_sticky |= IPPF_TCLASS;
5060 		}
5061 	}
5062 
5063 no_options:
5064 
5065 	/*
5066 	 * If any options carried in the ip6i_t were specified, we
5067 	 * need to account for the ip6i_t in the data we'll be sending
5068 	 * down.
5069 	 */
5070 	if (option_exists & IPPF_HAS_IP6I)
5071 		ip_hdr_len += sizeof (ip6i_t);
5072 
5073 	/* check/fix buffer config, setup pointers into it */
5074 	ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len];
5075 	if ((mp->b_datap->db_ref != 1) ||
5076 	    ((unsigned char *)ip6h < mp->b_datap->db_base) ||
5077 	    !OK_32PTR(ip6h)) {
5078 		mblk_t	*mp1;
5079 
5080 		/* Try to get everything in a single mblk next time */
5081 		if (ip_hdr_len > icmp->icmp_max_hdr_len) {
5082 			icmp->icmp_max_hdr_len = ip_hdr_len;
5083 
5084 			(void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp,
5085 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
5086 		}
5087 		mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO);
5088 		if (!mp1) {
5089 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5090 			return (ENOMEM);
5091 		}
5092 		mp1->b_cont = mp;
5093 		mp1->b_wptr = mp1->b_datap->db_lim;
5094 		ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len);
5095 		mp = mp1;
5096 	}
5097 	mp->b_rptr = (unsigned char *)ip6h;
5098 	ip6i = (ip6i_t *)ip6h;
5099 
5100 #define	ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp)
5101 	if (option_exists & IPPF_HAS_IP6I) {
5102 		ip6h = (ip6_t *)&ip6i[1];
5103 		ip6i->ip6i_flags = 0;
5104 		ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5105 
5106 		/* sin6_scope_id takes precendence over IPPF_IFINDEX */
5107 		if (option_exists & IPPF_SCOPE_ID) {
5108 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5109 			ip6i->ip6i_ifindex = sin6->sin6_scope_id;
5110 		} else if (option_exists & IPPF_IFINDEX) {
5111 			tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX);
5112 			ASSERT(tipp->ipp_ifindex != 0);
5113 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5114 			ip6i->ip6i_ifindex = tipp->ipp_ifindex;
5115 		}
5116 
5117 		if (option_exists & IPPF_RAW_CKSUM) {
5118 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
5119 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
5120 		}
5121 
5122 		if (option_exists & IPPF_NO_CKSUM) {
5123 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
5124 		}
5125 
5126 		if (option_exists & IPPF_ADDR) {
5127 			/*
5128 			 * Enable per-packet source address verification if
5129 			 * IPV6_PKTINFO specified the source address.
5130 			 * ip6_src is set in the transport's _wput function.
5131 			 */
5132 			ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
5133 		}
5134 
5135 		if (option_exists & IPPF_DONTFRAG) {
5136 			ip6i->ip6i_flags |= IP6I_DONTFRAG;
5137 		}
5138 
5139 		if (option_exists & IPPF_USE_MIN_MTU) {
5140 			ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU(
5141 			    ip6i->ip6i_flags, ipp->ipp_use_min_mtu);
5142 		}
5143 
5144 		if (option_exists & IPPF_NEXTHOP) {
5145 			tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP);
5146 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop));
5147 			ip6i->ip6i_flags |= IP6I_NEXTHOP;
5148 			ip6i->ip6i_nexthop = tipp->ipp_nexthop;
5149 		}
5150 
5151 		/*
5152 		 * tell IP this is an ip6i_t private header
5153 		 */
5154 		ip6i->ip6i_nxt = IPPROTO_RAW;
5155 	}
5156 
5157 	/* Initialize IPv6 header */
5158 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5159 	bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src));
5160 
5161 	/* Set the hoplimit of the outgoing packet. */
5162 	if (option_exists & IPPF_HOPLIMIT) {
5163 		/* IPV6_HOPLIMIT ancillary data overrides all other settings. */
5164 		ip6h->ip6_hops = ipp->ipp_hoplimit;
5165 		ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5166 	} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
5167 		ip6h->ip6_hops = icmp->icmp_multicast_ttl;
5168 		if (option_exists & IPPF_MULTICAST_HOPS)
5169 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5170 	} else {
5171 		ip6h->ip6_hops = icmp->icmp_ttl;
5172 		if (option_exists & IPPF_UNICAST_HOPS)
5173 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5174 	}
5175 
5176 	if (option_exists & IPPF_ADDR) {
5177 		tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR);
5178 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr));
5179 		ip6h->ip6_src = tipp->ipp_addr;
5180 	} else {
5181 		/*
5182 		 * The source address was not set using IPV6_PKTINFO.
5183 		 * First look at the bound source.
5184 		 * If unspecified fallback to __sin6_src_id.
5185 		 */
5186 		ip6h->ip6_src = icmp->icmp_v6src;
5187 		if (sin6->__sin6_src_id != 0 &&
5188 		    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
5189 			ip_srcid_find_id(sin6->__sin6_src_id,
5190 			    &ip6h->ip6_src, icmp->icmp_zoneid,
5191 			    is->is_netstack);
5192 		}
5193 	}
5194 
5195 	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
5196 	cp = (uint8_t *)&ip6h[1];
5197 
5198 	/*
5199 	 * Here's where we have to start stringing together
5200 	 * any extension headers in the right order:
5201 	 * Hop-by-hop, destination, routing, and final destination opts.
5202 	 */
5203 	if (option_exists & IPPF_HOPOPTS) {
5204 		/* Hop-by-hop options */
5205 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
5206 		tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS);
5207 
5208 		*nxthdr_ptr = IPPROTO_HOPOPTS;
5209 		nxthdr_ptr = &hbh->ip6h_nxt;
5210 
5211 		bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen);
5212 		cp += tipp->ipp_hopoptslen;
5213 	}
5214 	/*
5215 	 * En-route destination options
5216 	 * Only do them if there's a routing header as well
5217 	 */
5218 	if (option_exists & IPPF_RTDSTOPTS) {
5219 		ip6_dest_t *dst = (ip6_dest_t *)cp;
5220 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS);
5221 
5222 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5223 		nxthdr_ptr = &dst->ip6d_nxt;
5224 
5225 		bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen);
5226 		cp += tipp->ipp_rtdstoptslen;
5227 	}
5228 	/*
5229 	 * Routing header next
5230 	 */
5231 	if (option_exists & IPPF_RTHDR) {
5232 		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
5233 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR);
5234 
5235 		*nxthdr_ptr = IPPROTO_ROUTING;
5236 		nxthdr_ptr = &rt->ip6r_nxt;
5237 
5238 		bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen);
5239 		cp += tipp->ipp_rthdrlen;
5240 	}
5241 	/*
5242 	 * Do ultimate destination options
5243 	 */
5244 	if (option_exists & IPPF_DSTOPTS) {
5245 		ip6_dest_t *dest = (ip6_dest_t *)cp;
5246 		tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS);
5247 
5248 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5249 		nxthdr_ptr = &dest->ip6d_nxt;
5250 
5251 		bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen);
5252 		cp += tipp->ipp_dstoptslen;
5253 	}
5254 
5255 	/*
5256 	 * Now set the last header pointer to the proto passed in
5257 	 */
5258 	ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len);
5259 	*nxthdr_ptr = icmp->icmp_proto;
5260 
5261 	/*
5262 	 * Copy in the destination address
5263 	 */
5264 	ip6h->ip6_dst = ip6_dst;
5265 
5266 	ip6h->ip6_vcf =
5267 	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
5268 	    (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
5269 
5270 	if (option_exists & IPPF_TCLASS) {
5271 		tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS);
5272 		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
5273 		    tipp->ipp_tclass);
5274 	}
5275 	if (option_exists & IPPF_RTHDR) {
5276 		ip6_rthdr_t	*rth;
5277 
5278 		/*
5279 		 * Perform any processing needed for source routing.
5280 		 * We know that all extension headers will be in the same mblk
5281 		 * as the IPv6 header.
5282 		 */
5283 		rth = ip_find_rthdr_v6(ip6h, mp->b_wptr);
5284 		if (rth != NULL && rth->ip6r_segleft != 0) {
5285 			if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
5286 				/*
5287 				 * Drop packet - only support Type 0 routing.
5288 				 * Notify the application as well.
5289 				 */
5290 				BUMP_MIB(&is->is_rawip_mib,
5291 				    rawipOutErrors);
5292 				return (EPROTO);
5293 			}
5294 			/*
5295 			 * rth->ip6r_len is twice the number of
5296 			 * addresses in the header
5297 			 */
5298 			if (rth->ip6r_len & 0x1) {
5299 				BUMP_MIB(&is->is_rawip_mib,
5300 				    rawipOutErrors);
5301 				return (EPROTO);
5302 			}
5303 			/*
5304 			 * Shuffle the routing header and ip6_dst
5305 			 * addresses, and get the checksum difference
5306 			 * between the first hop (in ip6_dst) and
5307 			 * the destination (in the last routing hdr entry).
5308 			 */
5309 			csum = ip_massage_options_v6(ip6h, rth,
5310 			    is->is_netstack);
5311 			/*
5312 			 * Verify that the first hop isn't a mapped address.
5313 			 * Routers along the path need to do this verification
5314 			 * for subsequent hops.
5315 			 */
5316 			if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
5317 				BUMP_MIB(&is->is_rawip_mib,
5318 				    rawipOutErrors);
5319 				return (EADDRNOTAVAIL);
5320 			}
5321 		}
5322 	}
5323 
5324 	ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
5325 	if (mp->b_cont != NULL)
5326 		ip_len += msgdsize(mp->b_cont);
5327 
5328 	/*
5329 	 * Set the length into the IP header.
5330 	 * If the length is greater than the maximum allowed by IP,
5331 	 * then free the message and return. Do not try and send it
5332 	 * as this can cause problems in layers below.
5333 	 */
5334 	if (ip_len > IP_MAXPACKET) {
5335 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5336 		return (EMSGSIZE);
5337 	}
5338 	if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) {
5339 		uint_t	cksum_off;	/* From ip6i == mp->b_rptr */
5340 		uint16_t *cksum_ptr;
5341 		uint_t	ext_hdrs_len;
5342 
5343 		/* ICMPv6 must have an offset matching icmp6_cksum offset */
5344 		ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 ||
5345 		    icmp->icmp_checksum_off == 2);
5346 
5347 		/*
5348 		 * We make it easy for IP to include our pseudo header
5349 		 * by putting our length in uh_checksum, modified (if
5350 		 * we have a routing header) by the checksum difference
5351 		 * between the ultimate destination and first hop addresses.
5352 		 * Note: ICMPv6 must always checksum the packet.
5353 		 */
5354 		cksum_off = ip_hdr_len + icmp->icmp_checksum_off;
5355 		if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) {
5356 			if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) {
5357 				BUMP_MIB(&is->is_rawip_mib,
5358 				    rawipOutErrors);
5359 				freemsg(mp);
5360 				return (0);
5361 			}
5362 			ip6i = (ip6i_t *)mp->b_rptr;
5363 			if (ip6i->ip6i_nxt == IPPROTO_RAW)
5364 				ip6h = (ip6_t *)&ip6i[1];
5365 			else
5366 				ip6h = (ip6_t *)ip6i;
5367 		}
5368 		/* Add payload length to checksum */
5369 		ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN -
5370 		    (int)((uchar_t *)ip6h - (uchar_t *)ip6i);
5371 		csum += htons(ip_len - ext_hdrs_len);
5372 
5373 		cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off);
5374 		csum = (csum & 0xFFFF) + (csum >> 16);
5375 		*cksum_ptr = (uint16_t)csum;
5376 	}
5377 
5378 #ifdef _LITTLE_ENDIAN
5379 	ip_len = htons(ip_len);
5380 #endif
5381 	ip6h->ip6_plen = (uint16_t)ip_len;
5382 
5383 	/* We're done. Pass the packet to IP */
5384 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
5385 	ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT);
5386 	return (0);
5387 }
5388 
/*
 * Write-side handler for messages that are not ordinary data.
 *
 * Dispatches on the STREAMS message type:
 *   - M_PROTO/M_PCPROTO: TPI primitives, each routed to its handler.
 *   - M_IOCTL: TI_GETPEERNAME/TI_GETMYNAME, ND_GET/ND_SET and
 *     _SIOCSOCKFALLBACK.
 *   - M_IOCDATA: continuation of a copyin/copyout, see icmp_wput_iocdata().
 * Any message that no case consumes (every path that ends in "break"
 * rather than "return") is handed to ip_wput() at the bottom.
 */
static void
icmp_wput_other(queue_t *q, mblk_t *mp)
{
	uchar_t	*rptr = mp->b_rptr;
	struct iocblk *iocp;
	/* Not referenced in this function and not #undef'd in it either. */
#define	tudr ((struct T_unitdata_req *)rptr)
	conn_t	*connp = Q_TO_CONN(q);
	icmp_t	*icmp = connp->conn_icmp;
	icmp_stack_t *is = icmp->icmp_is;
	cred_t *cr;

	switch (mp->b_datap->db_type) {
	case M_PROTO:
	case M_PCPROTO:
		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
			/*
			 * If the message does not contain a PRIM_type,
			 * throw it away.
			 */
			freemsg(mp);
			return;
		}
		/* The primitive type is the first field of the message. */
		switch (((union T_primitives *)rptr)->type) {
		case T_ADDR_REQ:
			icmp_addr_req(q, mp);
			return;
		case O_T_BIND_REQ:
		case T_BIND_REQ:
			icmp_tpi_bind(q, mp);
			return;
		case T_CONN_REQ:
			icmp_tpi_connect(q, mp);
			return;
		case T_CAPABILITY_REQ:
			icmp_capability_req(q, mp);
			return;
		case T_INFO_REQ:
			icmp_info_req(q, mp);
			return;
		case T_UNITDATA_REQ:
			/*
			 * If a T_UNITDATA_REQ gets here, the address must
			 * be bad.  Valid T_UNITDATA_REQs are found above
			 * and break to below this switch.
			 */
			icmp_ud_err(q, mp, EADDRNOTAVAIL);
			return;
		case T_UNBIND_REQ:
			icmp_tpi_unbind(q, mp);
			return;

		case T_SVR4_OPTMGMT_REQ:
			/*
			 * All Solaris components should pass a db_credp
			 * for this TPI message, hence we ASSERT.
			 * But in case there is some other M_PROTO that looks
			 * like a TPI message sent by some other kernel
			 * component, we check and return an error.
			 */
			cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			if (cr == NULL) {
				icmp_err_ack(q, mp, TSYSERR, EINVAL);
				return;
			}

			/*
			 * Offer the request to the SNMP handler first; only
			 * when it returns zero is it treated as a regular
			 * option management request.
			 */
			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
			    cr)) {
				/* Only IP can return anything meaningful */
				(void) svr4_optcom_req(q, mp, cr,
				    &icmp_opt_obj, B_TRUE);
			}
			return;

		case T_OPTMGMT_REQ:
			/*
			 * All Solaris components should pass a db_credp
			 * for this TPI message, hence we ASSERT.
			 * But in case there is some other M_PROTO that looks
			 * like a TPI message sent by some other kernel
			 * component, we check and return an error.
			 */
			cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			if (cr == NULL) {
				icmp_err_ack(q, mp, TSYSERR, EINVAL);
				return;
			}
			/* Only IP can return anything meaningful */
			(void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE);
			return;

		case T_DISCON_REQ:
			icmp_tpi_disconnect(q, mp);
			return;

		/* The following TPI message is not supported by icmp. */
		case O_T_CONN_RES:
		case T_CONN_RES:
			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
			return;

		/* The following 3 TPI requests are illegal for icmp. */
		case T_DATA_REQ:
		case T_EXDATA_REQ:
		case T_ORDREL_REQ:
			/* Drop the request and send M_ERROR(EPROTO) upstream. */
			freemsg(mp);
			(void) putctl1(RD(q), M_ERROR, EPROTO);
			return;
		default:
			/* Unrecognized primitive: falls through to ip_wput(). */
			break;
		}
		break;
	case M_IOCTL:
		iocp = (struct iocblk *)mp->b_rptr;
		switch (iocp->ioc_cmd) {
		case TI_GETPEERNAME:
			if (icmp->icmp_state != TS_DATA_XFER) {
				/*
				 * If a default destination address has not
				 * been associated with the stream, then we
				 * don't know the peer's name.
				 */
				iocp->ioc_error = ENOTCONN;
		/* No goto in this function targets this label. */
		err_ret:;
				iocp->ioc_count = 0;
				mp->b_datap->db_type = M_IOCACK;
				qreply(q, mp);
				return;
			}
			/* FALLTHRU */
		case TI_GETMYNAME:
			/*
			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
			 * need to copyin the user's strbuf structure.
			 * Processing will continue in the M_IOCDATA case
			 * below.
			 */
			mi_copyin(q, mp, NULL,
			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
			return;
		case ND_SET:
			/* nd_getset performs the necessary error checking */
		case ND_GET:
			/*
			 * A non-zero return means the request was handled
			 * against this stack's table and the reply goes
			 * back up; otherwise the message is passed on to
			 * ip_wput() below.
			 */
			if (nd_getset(q, is->is_nd, mp)) {
				qreply(q, mp);
				return;
			}
			break;
		case _SIOCSOCKFALLBACK:
			/*
			 * socket is falling back to be a
			 * streams socket. Nothing  to do
			 *
			 * NOTE(review): unlike the ENOTCONN path above, the
			 * message type is not changed to M_IOCACK before the
			 * reply is sent - confirm this is intended.
			 */
			iocp->ioc_count = 0;
			iocp->ioc_rval = 0;
			qreply(q, mp);
			return;
		default:
			break;
		}
		break;
	case M_IOCDATA:
		icmp_wput_iocdata(q, mp);
		return;
	default:
		break;
	}
	/* Nothing above consumed the message; let IP deal with it. */
	ip_wput(q, mp);
}
5559 
5560 /*
5561  * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA
5562  * messages.
5563  */
5564 static void
5565 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
5566 {
5567 	mblk_t	*mp1;
5568 	STRUCT_HANDLE(strbuf, sb);
5569 	icmp_t	*icmp;
5570 	uint_t	addrlen;
5571 	uint_t	error;
5572 
5573 	/* Make sure it is one of ours. */
5574 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5575 	case TI_GETMYNAME:
5576 	case TI_GETPEERNAME:
5577 		break;
5578 	default:
5579 		icmp = Q_TO_ICMP(q);
5580 		ip_output(icmp->icmp_connp, mp, q, IP_WPUT);
5581 		return;
5582 	}
5583 	switch (mi_copy_state(q, mp, &mp1)) {
5584 	case -1:
5585 		return;
5586 	case MI_COPY_CASE(MI_COPY_IN, 1):
5587 		break;
5588 	case MI_COPY_CASE(MI_COPY_OUT, 1):
5589 		/*
5590 		 * The address has been copied out, so now
5591 		 * copyout the strbuf.
5592 		 */
5593 		mi_copyout(q, mp);
5594 		return;
5595 	case MI_COPY_CASE(MI_COPY_OUT, 2):
5596 		/*
5597 		 * The address and strbuf have been copied out.
5598 		 * We're done, so just acknowledge the original
5599 		 * M_IOCTL.
5600 		 */
5601 		mi_copy_done(q, mp, 0);
5602 		return;
5603 	default:
5604 		/*
5605 		 * Something strange has happened, so acknowledge
5606 		 * the original M_IOCTL with an EPROTO error.
5607 		 */
5608 		mi_copy_done(q, mp, EPROTO);
5609 		return;
5610 	}
5611 	/*
5612 	 * Now we have the strbuf structure for TI_GETMYNAME
5613 	 * and TI_GETPEERNAME.  Next we copyout the requested
5614 	 * address and then we'll copyout the strbuf.
5615 	 */
5616 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
5617 	    (void *)mp1->b_rptr);
5618 	icmp = Q_TO_ICMP(q);
5619 	if (icmp->icmp_family == AF_INET)
5620 		addrlen = sizeof (sin_t);
5621 	else
5622 		addrlen = sizeof (sin6_t);
5623 
5624 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
5625 		mi_copy_done(q, mp, EINVAL);
5626 		return;
5627 	}
5628 
5629 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
5630 
5631 	if (mp1 == NULL)
5632 		return;
5633 
5634 	rw_enter(&icmp->icmp_rwlock, RW_READER);
5635 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5636 	case TI_GETMYNAME:
5637 		error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr,
5638 		    &addrlen);
5639 		break;
5640 	case TI_GETPEERNAME:
5641 		error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr,
5642 		    &addrlen);
5643 		break;
5644 	}
5645 	rw_exit(&icmp->icmp_rwlock);
5646 
5647 	if (error != 0) {
5648 		mi_copy_done(q, mp, error);
5649 	} else {
5650 		mp1->b_wptr += addrlen;
5651 		STRUCT_FSET(sb, len, addrlen);
5652 
5653 		/* Copy out the address */
5654 		mi_copyout(q, mp);
5655 	}
5656 }
5657 
5658 static int
5659 icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
5660     void *thisdg_attrs)
5661 {
5662 	struct T_unitdata_req *udreqp;
5663 	int is_absreq_failure;
5664 	cred_t *cr;
5665 
5666 	udreqp = (struct T_unitdata_req *)mp->b_rptr;
5667 	*errorp = 0;
5668 
5669 	/*
5670 	 * All Solaris components should pass a db_credp
5671 	 * for this TPI message, hence we ASSERT.
5672 	 * But in case there is some other M_PROTO that looks
5673 	 * like a TPI message sent by some other kernel
5674 	 * component, we check and return an error.
5675 	 */
5676 	cr = msg_getcred(mp, NULL);
5677 	ASSERT(cr != NULL);
5678 	if (cr == NULL)
5679 		return (-1);
5680 
5681 	*errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
5682 	    udreqp->OPT_offset, cr, &icmp_opt_obj,
5683 	    thisdg_attrs, &is_absreq_failure);
5684 
5685 	if (*errorp != 0) {
5686 		/*
5687 		 * Note: No special action needed in this
5688 		 * module for "is_absreq_failure"
5689 		 */
5690 		return (-1);		/* failure */
5691 	}
5692 	ASSERT(is_absreq_failure == 0);
5693 	return (0);	/* success */
5694 }
5695 
5696 void
5697 icmp_ddi_g_init(void)
5698 {
5699 	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
5700 	    icmp_opt_obj.odb_opt_arr_cnt);
5701 
5702 	/*
5703 	 * We want to be informed each time a stack is created or
5704 	 * destroyed in the kernel, so we can maintain the
5705 	 * set of icmp_stack_t's.
5706 	 */
5707 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
5708 }
5709 
5710 void
5711 icmp_ddi_g_destroy(void)
5712 {
5713 	netstack_unregister(NS_ICMP);
5714 }
5715 
5716 #define	INET_NAME	"ip"
5717 
5718 /*
5719  * Initialize the ICMP stack instance.
5720  */
5721 static void *
5722 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5723 {
5724 	icmp_stack_t	*is;
5725 	icmpparam_t	*pa;
5726 	int		error = 0;
5727 	major_t		major;
5728 
5729 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
5730 	is->is_netstack = ns;
5731 
5732 	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
5733 	is->is_param_arr = pa;
5734 	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
5735 
5736 	(void) icmp_param_register(&is->is_nd,
5737 	    is->is_param_arr, A_CNT(icmp_param_arr));
5738 	is->is_ksp = rawip_kstat_init(stackid);
5739 
5740 	major = mod_name_to_major(INET_NAME);
5741 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
5742 	ASSERT(error == 0);
5743 	return (is);
5744 }
5745 
5746 /*
5747  * Free the ICMP stack instance.
5748  */
5749 static void
5750 rawip_stack_fini(netstackid_t stackid, void *arg)
5751 {
5752 	icmp_stack_t *is = (icmp_stack_t *)arg;
5753 
5754 	nd_free(&is->is_nd);
5755 	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
5756 	is->is_param_arr = NULL;
5757 
5758 	rawip_kstat_fini(stackid, is->is_ksp);
5759 	is->is_ksp = NULL;
5760 	ldi_ident_release(is->is_ldi_ident);
5761 	kmem_free(is, sizeof (*is));
5762 }
5763 
5764 static void *
5765 rawip_kstat_init(netstackid_t stackid) {
5766 	kstat_t	*ksp;
5767 
5768 	rawip_named_kstat_t template = {
5769 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5770 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5771 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5772 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5773 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5774 	};
5775 
5776 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5777 					KSTAT_TYPE_NAMED,
5778 					NUM_OF_FIELDS(rawip_named_kstat_t),
5779 					0, stackid);
5780 	if (ksp == NULL || ksp->ks_data == NULL)
5781 		return (NULL);
5782 
5783 	bcopy(&template, ksp->ks_data, sizeof (template));
5784 	ksp->ks_update = rawip_kstat_update;
5785 	ksp->ks_private = (void *)(uintptr_t)stackid;
5786 
5787 	kstat_install(ksp);
5788 	return (ksp);
5789 }
5790 
5791 static void
5792 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5793 {
5794 	if (ksp != NULL) {
5795 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5796 		kstat_delete_netstack(ksp, stackid);
5797 	}
5798 }
5799 
5800 static int
5801 rawip_kstat_update(kstat_t *ksp, int rw)
5802 {
5803 	rawip_named_kstat_t *rawipkp;
5804 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5805 	netstack_t	*ns;
5806 	icmp_stack_t	*is;
5807 
5808 	if ((ksp == NULL) || (ksp->ks_data == NULL))
5809 		return (EIO);
5810 
5811 	if (rw == KSTAT_WRITE)
5812 		return (EACCES);
5813 
5814 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5815 
5816 	ns = netstack_find_by_stackid(stackid);
5817 	if (ns == NULL)
5818 		return (-1);
5819 	is = ns->netstack_icmp;
5820 	if (is == NULL) {
5821 		netstack_rele(ns);
5822 		return (-1);
5823 	}
5824 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5825 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5826 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5827 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5828 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5829 	netstack_rele(ns);
5830 	return (0);
5831 }
5832 
5833 /* ARGSUSED */
5834 int
5835 rawip_accept(sock_lower_handle_t lproto_handle,
5836     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5837     cred_t *cr)
5838 {
5839 	return (EOPNOTSUPP);
5840 }
5841 
5842 /* ARGSUSED */
5843 int
5844 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5845     socklen_t len, cred_t *cr)
5846 {
5847 	conn_t  *connp = (conn_t *)proto_handle;
5848 	int error;
5849 
5850 	/* All Solaris components should pass a cred for this operation. */
5851 	ASSERT(cr != NULL);
5852 
5853 	/* Binding to a NULL address really means unbind */
5854 	if (sa == NULL)
5855 		error = rawip_do_unbind(connp);
5856 	else
5857 		error = rawip_do_bind(connp, sa, len);
5858 
5859 	if (error < 0) {
5860 		if (error == -TOUTSTATE)
5861 			error = EINVAL;
5862 		else
5863 			error = proto_tlitosyserr(-error);
5864 	}
5865 	return (error);
5866 }
5867 
5868 static int
5869 rawip_implicit_bind(conn_t *connp)
5870 {
5871 	sin6_t sin6addr;
5872 	sin_t *sin;
5873 	sin6_t *sin6;
5874 	socklen_t len;
5875 	int error;
5876 
5877 	if (connp->conn_icmp->icmp_family == AF_INET) {
5878 		len = sizeof (struct sockaddr_in);
5879 		sin = (sin_t *)&sin6addr;
5880 		*sin = sin_null;
5881 		sin->sin_family = AF_INET;
5882 		sin->sin_addr.s_addr = INADDR_ANY;
5883 	} else {
5884 		ASSERT(connp->conn_icmp->icmp_family == AF_INET6);
5885 		len = sizeof (sin6_t);
5886 		sin6 = (sin6_t *)&sin6addr;
5887 		*sin6 = sin6_null;
5888 		sin6->sin6_family = AF_INET6;
5889 		V6_SET_ZERO(sin6->sin6_addr);
5890 	}
5891 
5892 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5893 
5894 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5895 }
5896 
5897 static int
5898 rawip_unbind(conn_t *connp)
5899 {
5900 	int error;
5901 
5902 	error = rawip_do_unbind(connp);
5903 	if (error < 0) {
5904 		error = proto_tlitosyserr(-error);
5905 	}
5906 	return (error);
5907 }
5908 
5909 /* ARGSUSED */
5910 int
5911 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5912 {
5913 	return (EOPNOTSUPP);
5914 }
5915 
5916 /* ARGSUSED */
5917 int
5918 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5919     socklen_t len, sock_connid_t *id, cred_t *cr)
5920 {
5921 	conn_t	*connp = (conn_t *)proto_handle;
5922 	icmp_t *icmp = connp->conn_icmp;
5923 	int	error;
5924 	boolean_t did_bind = B_FALSE;
5925 
5926 	/* All Solaris components should pass a cred for this operation. */
5927 	ASSERT(cr != NULL);
5928 
5929 	if (sa == NULL) {
5930 		/*
5931 		 * Disconnect
5932 		 * Make sure we are connected
5933 		 */
5934 		if (icmp->icmp_state != TS_DATA_XFER)
5935 			return (EINVAL);
5936 
5937 		error = icmp_disconnect(connp);
5938 		return (error);
5939 	}
5940 
5941 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
5942 	if (error != 0)
5943 		return (error);
5944 
5945 	/* do an implicit bind if necessary */
5946 	if (icmp->icmp_state == TS_UNBND) {
5947 		error = rawip_implicit_bind(connp);
5948 		/*
5949 		 * We could be racing with an actual bind, in which case
5950 		 * we would see EPROTO. We cross our fingers and try
5951 		 * to connect.
5952 		 */
5953 		if (!(error == 0 || error == EPROTO))
5954 			return (error);
5955 		did_bind = B_TRUE;
5956 	}
5957 
5958 	/*
5959 	 * set SO_DGRAM_ERRIND
5960 	 */
5961 	icmp->icmp_dgram_errind = B_TRUE;
5962 
5963 	error = rawip_do_connect(connp, sa, len, cr);
5964 
5965 	if (error != 0 && did_bind) {
5966 		int unbind_err;
5967 
5968 		unbind_err = rawip_unbind(connp);
5969 		ASSERT(unbind_err == 0);
5970 	}
5971 
5972 	if (error == 0) {
5973 		*id = 0;
5974 		(*connp->conn_upcalls->su_connected)
5975 		    (connp->conn_upper_handle, 0, NULL, -1);
5976 	} else if (error < 0) {
5977 		error = proto_tlitosyserr(-error);
5978 	}
5979 	return (error);
5980 }
5981 
/*
 * Socket fallback downcall: convert this endpoint from a socket that
 * talks to the protocol directly into one that sits on the STREAMS queue
 * pair q.
 *
 * In order, the function attaches the conn to q, tells the stream head
 * about write offset and receive high-water mark, frees the helper
 * stream, reports the current addresses and options to the socket layer
 * through quiesced_cb, pushes any messages that were queued during the
 * fallback up the new read queue, and finally clears IPCL_NONSTR.
 * Always returns 0.
 */
/* ARGSUSED */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
{
	conn_t  *connp = (conn_t *)proto_handle;
	icmp_t	*icmp;
	struct T_capability_ack tca;
	struct sockaddr_in6 laddr, faddr;
	socklen_t laddrlen, faddrlen;
	short opts;
	struct stroptions *stropt;
	mblk_t *stropt_mp;
	int error;

	icmp = connp->conn_icmp;

	/* The result is used below without a NULL check. */
	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);

	/*
	 * setup the fallback stream that was allocated
	 *
	 * The values found in the q_ptr fields are saved in the conn
	 * before both q_ptr fields are pointed at connp.
	 */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	WR(q)->q_qinfo = &icmpwinit;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	/* Notify stream head about options before sending up data */
	stropt_mp->b_datap->db_type = M_SETOPTS;
	stropt_mp->b_wptr += sizeof (*stropt);
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt->so_flags = SO_WROFF | SO_HIWAT;
	stropt->so_wroff =
	    (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra);
	stropt->so_hiwat = icmp->icmp_recv_hiwat;
	putnext(RD(q), stropt_mp);

	/*
	 * free helper stream
	 */
	ip_free_helper_stream(connp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	icmp_do_capability_ack(icmp, &tca, TC1_INFO);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) rawip_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = rawip_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	/* A failed getpeername is reported as a zero-length peer address. */
	if (error != 0)
		faddrlen = 0;
	opts = 0;
	if (icmp->icmp_dgram_errind)
		opts |= SO_DGRAM_ERRIND;
	if (icmp->icmp_dontroute)
		opts |= SO_DONTROUTE;

	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/*
	 * Attempts to send data up during fallback will result in it being
	 * queued in icmp_t. Now we push up any queued packets.
	 *
	 * icmp_recv_lock is dropped around each putnext() and re-taken
	 * before the queue head is examined again.
	 */
	mutex_enter(&icmp->icmp_recv_lock);
	while (icmp->icmp_fallback_queue_head != NULL) {
		mblk_t	*mp;

		mp = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp->b_next;
		mp->b_next = NULL;
		mutex_exit(&icmp->icmp_recv_lock);
		putnext(RD(q), mp);
		mutex_enter(&icmp->icmp_recv_lock);
	}
	/* The head is NULL here, so this clears the tail as well. */
	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;

	/*
	 * No longer a streams less socket
	 *
	 * icmp_rwlock is taken as writer while icmp_recv_lock is still
	 * held, so the flag is cleared before the queue lock is released.
	 */
	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
	connp->conn_flags &= ~IPCL_NONSTR;
	rw_exit(&icmp->icmp_rwlock);

	mutex_exit(&icmp->icmp_recv_lock);

	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
	    icmp->icmp_fallback_queue_tail == NULL);

	ASSERT(connp->conn_ref >= 1);

	return (0);
}
6084 
6085 /* ARGSUSED */
6086 sock_lower_handle_t
6087 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
6088     uint_t *smodep, int *errorp, int flags, cred_t *credp)
6089 {
6090 	conn_t *connp;
6091 
6092 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
6093 		*errorp = EPROTONOSUPPORT;
6094 		return (NULL);
6095 	}
6096 
6097 	connp = icmp_open(family, credp, errorp, flags);
6098 	if (connp != NULL) {
6099 		icmp_stack_t *is;
6100 
6101 		is = connp->conn_icmp->icmp_is;
6102 		connp->conn_flags |= IPCL_NONSTR;
6103 
6104 		if (connp->conn_icmp->icmp_family == AF_INET6) {
6105 			/* Build initial header template for transmit */
6106 			rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
6107 			if ((*errorp =
6108 			    icmp_build_hdrs(connp->conn_icmp)) != 0) {
6109 				rw_exit(&connp->conn_icmp->icmp_rwlock);
6110 				ipcl_conn_destroy(connp);
6111 				return (NULL);
6112 			}
6113 			rw_exit(&connp->conn_icmp->icmp_rwlock);
6114 		}
6115 
6116 		connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat;
6117 		connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat;
6118 
6119 		if ((*errorp = ip_create_helper_stream(connp,
6120 		    is->is_ldi_ident)) != 0) {
6121 			cmn_err(CE_CONT, "create of IP helper stream failed\n");
6122 			(void) rawip_do_close(connp);
6123 			return (NULL);
6124 		}
6125 
6126 		mutex_enter(&connp->conn_lock);
6127 		connp->conn_state_flags &= ~CONN_INCIPIENT;
6128 		mutex_exit(&connp->conn_lock);
6129 		*sock_downcalls = &sock_rawip_downcalls;
6130 		*smodep = SM_ATOMIC;
6131 	} else {
6132 		ASSERT(*errorp != 0);
6133 	}
6134 
6135 	return ((sock_lower_handle_t)connp);
6136 }
6137 
6138 /* ARGSUSED */
6139 void
6140 rawip_activate(sock_lower_handle_t proto_handle,
6141     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
6142     cred_t *cr)
6143 {
6144 	conn_t 			*connp = (conn_t *)proto_handle;
6145 	icmp_stack_t 		*is = connp->conn_icmp->icmp_is;
6146 	struct sock_proto_props sopp;
6147 
6148 	/* All Solaris components should pass a cred for this operation. */
6149 	ASSERT(cr != NULL);
6150 
6151 	connp->conn_upcalls = sock_upcalls;
6152 	connp->conn_upper_handle = sock_handle;
6153 
6154 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
6155 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
6156 	sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len +
6157 	    is->is_wroff_extra;
6158 	sopp.sopp_rxhiwat = is->is_recv_hiwat;
6159 	sopp.sopp_rxlowat = icmp_mod_info.mi_lowat;
6160 	sopp.sopp_maxblk = INFPSZ;
6161 	sopp.sopp_maxpsz = IP_MAXPACKET;
6162 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
6163 	    icmp_mod_info.mi_minpsz;
6164 
6165 	(*connp->conn_upcalls->su_set_proto_props)
6166 	    (connp->conn_upper_handle, &sopp);
6167 }
6168 
6169 static int
6170 rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6171 {
6172 	sin_t	*sin = (sin_t *)sa;
6173 	sin6_t	*sin6 = (sin6_t *)sa;
6174 
6175 	ASSERT(icmp != NULL);
6176 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6177 
6178 	switch (icmp->icmp_family) {
6179 	case AF_INET:
6180 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6181 		if (*salenp < sizeof (sin_t))
6182 			return (EINVAL);
6183 
6184 		*salenp = sizeof (sin_t);
6185 		*sin = sin_null;
6186 		sin->sin_family = AF_INET;
6187 		if (icmp->icmp_state == TS_UNBND) {
6188 			break;
6189 		}
6190 
6191 		if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
6192 		    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6193 			sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src);
6194 		} else {
6195 			/*
6196 			 * INADDR_ANY
6197 			 * icmp_v6src is not set, we might be bound to
6198 			 * broadcast/multicast. Use icmp_bound_v6src as
6199 			 * local address instead (that could
6200 			 * also still be INADDR_ANY)
6201 			 */
6202 			sin->sin_addr.s_addr =
6203 			    V4_PART_OF_V6(icmp->icmp_bound_v6src);
6204 		}
6205 		break;
6206 	case AF_INET6:
6207 
6208 		if (*salenp < sizeof (sin6_t))
6209 			return (EINVAL);
6210 
6211 		*salenp = sizeof (sin6_t);
6212 		*sin6 = sin6_null;
6213 		sin6->sin6_family = AF_INET6;
6214 		if (icmp->icmp_state == TS_UNBND) {
6215 			break;
6216 		}
6217 		if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6218 			sin6->sin6_addr = icmp->icmp_v6src;
6219 		} else {
6220 			/*
6221 			 * UNSPECIFIED
6222 			 * icmp_v6src is not set, we might be bound to
6223 			 * broadcast/multicast. Use icmp_bound_v6src as
6224 			 * local address instead (that could
6225 			 * also still be UNSPECIFIED)
6226 			 */
6227 
6228 			sin6->sin6_addr = icmp->icmp_bound_v6src;
6229 		}
6230 		break;
6231 	}
6232 	return (0);
6233 }
6234 
6235 static int
6236 rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6237 {
6238 	sin_t   *sin = (sin_t *)sa;
6239 	sin6_t  *sin6 = (sin6_t *)sa;
6240 
6241 	ASSERT(icmp != NULL);
6242 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6243 
6244 	if (icmp->icmp_state != TS_DATA_XFER)
6245 		return (ENOTCONN);
6246 
6247 	sa->sa_family = icmp->icmp_family;
6248 	switch (icmp->icmp_family) {
6249 	case AF_INET:
6250 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6251 
6252 		if (*salenp < sizeof (sin_t))
6253 			return (EINVAL);
6254 
6255 		*salenp = sizeof (sin_t);
6256 		*sin = sin_null;
6257 		sin->sin_family = AF_INET;
6258 		sin->sin_addr.s_addr =
6259 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6260 		break;
6261 	case AF_INET6:
6262 		if (*salenp < sizeof (sin6_t))
6263 			return (EINVAL);
6264 
6265 		*salenp = sizeof (sin6_t);
6266 		*sin6 = sin6_null;
6267 		*sin6 = icmp->icmp_v6dst;
6268 		break;
6269 	}
6270 	return (0);
6271 }
6272 
6273 /* ARGSUSED */
6274 int
6275 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6276     socklen_t *salenp, cred_t *cr)
6277 {
6278 	conn_t  *connp = (conn_t *)proto_handle;
6279 	icmp_t  *icmp = connp->conn_icmp;
6280 	int	error;
6281 
6282 	/* All Solaris components should pass a cred for this operation. */
6283 	ASSERT(cr != NULL);
6284 
6285 	ASSERT(icmp != NULL);
6286 
6287 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6288 
6289 	error = rawip_do_getpeername(icmp, sa, salenp);
6290 
6291 	rw_exit(&icmp->icmp_rwlock);
6292 
6293 	return (error);
6294 }
6295 
6296 /* ARGSUSED */
6297 int
6298 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6299     socklen_t *salenp, cred_t *cr)
6300 {
6301 	conn_t  *connp = (conn_t *)proto_handle;
6302 	icmp_t	*icmp = connp->conn_icmp;
6303 	int	error;
6304 
6305 	/* All Solaris components should pass a cred for this operation. */
6306 	ASSERT(cr != NULL);
6307 
6308 	ASSERT(icmp != NULL);
6309 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6310 
6311 	error = rawip_do_getsockname(icmp, sa, salenp);
6312 
6313 	rw_exit(&icmp->icmp_rwlock);
6314 
6315 	return (error);
6316 }
6317 
6318 int
6319 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6320     const void *optvalp, socklen_t optlen, cred_t *cr)
6321 {
6322 	conn_t	*connp = (conn_t *)proto_handle;
6323 	icmp_t *icmp = connp->conn_icmp;
6324 	int error;
6325 
6326 	/* All Solaris components should pass a cred for this operation. */
6327 	ASSERT(cr != NULL);
6328 
6329 	error = proto_opt_check(level, option_name, optlen, NULL,
6330 	    icmp_opt_obj.odb_opt_des_arr,
6331 	    icmp_opt_obj.odb_opt_arr_cnt,
6332 	    icmp_opt_obj.odb_topmost_tpiprovider,
6333 	    B_TRUE, B_FALSE, cr);
6334 
6335 	if (error != 0) {
6336 		/*
6337 		 * option not recognized
6338 		 */
6339 		if (error < 0) {
6340 			error = proto_tlitosyserr(-error);
6341 		}
6342 		return (error);
6343 	}
6344 
6345 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6346 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
6347 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
6348 	    (uchar_t *)optvalp, NULL, cr);
6349 	rw_exit(&icmp->icmp_rwlock);
6350 
6351 	if (error < 0) {
6352 		/*
6353 		 * Pass on to ip
6354 		 */
6355 		error = ip_set_options(connp, level, option_name, optvalp,
6356 		    optlen, cr);
6357 	}
6358 
6359 	ASSERT(error >= 0);
6360 
6361 	return (error);
6362 }
6363 
6364 int
6365 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6366     void *optvalp, socklen_t *optlen, cred_t *cr)
6367 {
6368 	int		error;
6369 	conn_t		*connp = (conn_t *)proto_handle;
6370 	icmp_t		*icmp = connp->conn_icmp;
6371 	t_uscalar_t	max_optbuf_len;
6372 	void		*optvalp_buf;
6373 	int		len;
6374 
6375 	/* All Solaris components should pass a cred for this operation. */
6376 	ASSERT(cr != NULL);
6377 
6378 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
6379 	    icmp_opt_obj.odb_opt_des_arr,
6380 	    icmp_opt_obj.odb_opt_arr_cnt,
6381 	    icmp_opt_obj.odb_topmost_tpiprovider,
6382 	    B_FALSE, B_TRUE, cr);
6383 
6384 	if (error != 0) {
6385 		if (error < 0) {
6386 			error = proto_tlitosyserr(-error);
6387 		}
6388 		return (error);
6389 	}
6390 
6391 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
6392 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6393 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
6394 	rw_exit(&icmp->icmp_rwlock);
6395 
6396 	if (len < 0) {
6397 		/*
6398 		 * Pass on to IP
6399 		 */
6400 		kmem_free(optvalp_buf, max_optbuf_len);
6401 		return (ip_get_options(connp, level, option_name, optvalp,
6402 		    optlen, cr));
6403 	} else {
6404 		/*
6405 		 * update optlen and copy option value
6406 		 */
6407 		t_uscalar_t size = MIN(len, *optlen);
6408 		bcopy(optvalp_buf, optvalp, size);
6409 		bcopy(&size, optlen, sizeof (size));
6410 
6411 		kmem_free(optvalp_buf, max_optbuf_len);
6412 		return (0);
6413 	}
6414 }
6415 
6416 /* ARGSUSED */
6417 int
6418 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
6419 {
6420 	conn_t	*connp = (conn_t *)proto_handle;
6421 
6422 	/* All Solaris components should pass a cred for this operation. */
6423 	ASSERT(cr != NULL);
6424 
6425 	(void) rawip_do_close(connp);
6426 	return (0);
6427 }
6428 
6429 /* ARGSUSED */
6430 int
6431 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
6432 {
6433 	conn_t  *connp = (conn_t *)proto_handle;
6434 
6435 	/* All Solaris components should pass a cred for this operation. */
6436 	ASSERT(cr != NULL);
6437 
6438 	/* shut down the send side */
6439 	if (how != SHUT_RD)
6440 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6441 		    SOCK_OPCTL_SHUT_SEND, 0);
6442 	/* shut down the recv side */
6443 	if (how != SHUT_WR)
6444 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6445 		    SOCK_OPCTL_SHUT_RECV, 0);
6446 	return (0);
6447 }
6448 
6449 void
6450 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
6451 {
6452 	conn_t  *connp = (conn_t *)proto_handle;
6453 	icmp_t	*icmp = connp->conn_icmp;
6454 
6455 	mutex_enter(&icmp->icmp_recv_lock);
6456 	connp->conn_flow_cntrld = B_FALSE;
6457 	mutex_exit(&icmp->icmp_recv_lock);
6458 }
6459 
6460 int
6461 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
6462     int mode, int32_t *rvalp, cred_t *cr)
6463 {
6464 	conn_t  	*connp = (conn_t *)proto_handle;
6465 	int		error;
6466 
6467 	/* All Solaris components should pass a cred for this operation. */
6468 	ASSERT(cr != NULL);
6469 
6470 	switch (cmd) {
6471 	case ND_SET:
6472 	case ND_GET:
6473 	case _SIOCSOCKFALLBACK:
6474 	case TI_GETPEERNAME:
6475 	case TI_GETMYNAME:
6476 #ifdef DEBUG
6477 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
6478 		    " socket", cmd);
6479 #endif
6480 		error = EINVAL;
6481 		break;
6482 	default:
6483 		/*
6484 		 * Pass on to IP using helper stream
6485 		 */
6486 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
6487 		    cmd, arg, mode, cr, rvalp);
6488 		break;
6489 	}
6490 	return (error);
6491 }
6492 
6493 /* ARGSUSED */
6494 int
6495 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
6496     cred_t *cr)
6497 {
6498 	conn_t *connp = (conn_t *)proto_handle;
6499 	icmp_t	*icmp = connp->conn_icmp;
6500 	icmp_stack_t *is = icmp->icmp_is;
6501 	int error = 0;
6502 	boolean_t bypass_dgram_errind = B_FALSE;
6503 
6504 	ASSERT(DB_TYPE(mp) == M_DATA);
6505 
6506 	/* All Solaris components should pass a cred for this operation. */
6507 	ASSERT(cr != NULL);
6508 
6509 	/* If labeled then sockfs should have already set db_credp */
6510 	ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
6511 
6512 	/* do an implicit bind if necessary */
6513 	if (icmp->icmp_state == TS_UNBND) {
6514 		error = rawip_implicit_bind(connp);
6515 		/*
6516 		 * We could be racing with an actual bind, in which case
6517 		 * we would see EPROTO. We cross our fingers and try
6518 		 * to connect.
6519 		 */
6520 		if (!(error == 0 || error == EPROTO)) {
6521 			freemsg(mp);
6522 			return (error);
6523 		}
6524 	}
6525 
6526 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6527 
6528 	if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) {
6529 		error = EISCONN;
6530 		goto done_lock;
6531 	}
6532 
6533 	switch (icmp->icmp_family) {
6534 	case AF_INET6: {
6535 		sin6_t	*sin6;
6536 		ip6_pkt_t	ipp_s;	/* For ancillary data options */
6537 		ip6_pkt_t	*ipp = &ipp_s;
6538 
6539 		sin6 = (sin6_t *)msg->msg_name;
6540 		if (sin6 != NULL) {
6541 			error = proto_verify_ip_addr(icmp->icmp_family,
6542 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6543 			if (error != 0) {
6544 				bypass_dgram_errind = B_TRUE;
6545 				goto done_lock;
6546 			}
6547 			if (icmp->icmp_delayed_error != 0) {
6548 				sin6_t  *sin1 = (sin6_t *)msg->msg_name;
6549 				sin6_t  *sin2 = (sin6_t *)
6550 				    &icmp->icmp_delayed_addr;
6551 
6552 				error = icmp->icmp_delayed_error;
6553 				icmp->icmp_delayed_error = 0;
6554 
6555 				/* Compare IP address and port */
6556 
6557 				if (sin1->sin6_port == sin2->sin6_port &&
6558 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
6559 				    &sin2->sin6_addr)) {
6560 					goto done_lock;
6561 				}
6562 			}
6563 		} else {
6564 			/*
6565 			 * Use connected address
6566 			 */
6567 			if (icmp->icmp_state != TS_DATA_XFER) {
6568 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6569 				error = EDESTADDRREQ;
6570 				bypass_dgram_errind = B_TRUE;
6571 				goto done_lock;
6572 			}
6573 			sin6 = &icmp->icmp_v6dst;
6574 		}
6575 
6576 		/* No support for mapped addresses on raw sockets */
6577 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6578 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6579 			error = EADDRNOTAVAIL;
6580 			goto done_lock;
6581 		}
6582 
6583 		ipp->ipp_fields = 0;
6584 		ipp->ipp_sticky_ignored = 0;
6585 
6586 		/*
6587 		 * If options passed in, feed it for verification and handling
6588 		 */
6589 		if (msg->msg_controllen != 0) {
6590 			error = process_auxiliary_options(connp,
6591 			    msg->msg_control, msg->msg_controllen,
6592 			    ipp, &icmp_opt_obj, icmp_opt_set, cr);
6593 			if (error != 0) {
6594 				goto done_lock;
6595 			}
6596 		}
6597 
6598 		rw_exit(&icmp->icmp_rwlock);
6599 
6600 		/*
6601 		 * Destination is a native IPv6 address.
6602 		 * Send out an IPv6 format packet.
6603 		 */
6604 
6605 		error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6,
6606 		    ipp);
6607 	}
6608 		break;
6609 	case AF_INET: {
6610 		sin_t	*sin;
6611 		ip4_pkt_t pktinfo;
6612 		ip4_pkt_t *pktinfop = &pktinfo;
6613 		ipaddr_t	v4dst;
6614 
6615 		sin = (sin_t *)msg->msg_name;
6616 		if (sin != NULL) {
6617 			error = proto_verify_ip_addr(icmp->icmp_family,
6618 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6619 			if (error != 0) {
6620 				bypass_dgram_errind = B_TRUE;
6621 				goto done_lock;
6622 			}
6623 			v4dst = sin->sin_addr.s_addr;
6624 			if (icmp->icmp_delayed_error != 0) {
6625 				sin_t *sin1 = (sin_t *)msg->msg_name;
6626 				sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
6627 
6628 				error = icmp->icmp_delayed_error;
6629 				icmp->icmp_delayed_error = 0;
6630 
6631 				/* Compare IP address and port */
6632 				if (sin1->sin_port == sin2->sin_port &&
6633 				    sin1->sin_addr.s_addr ==
6634 				    sin2->sin_addr.s_addr) {
6635 					goto done_lock;
6636 				}
6637 
6638 			}
6639 		} else {
6640 			/*
6641 			 * Use connected address
6642 			 */
6643 			if (icmp->icmp_state != TS_DATA_XFER) {
6644 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6645 				error = EDESTADDRREQ;
6646 				bypass_dgram_errind = B_TRUE;
6647 				goto done_lock;
6648 			}
6649 			v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6650 		}
6651 
6652 
6653 		pktinfop->ip4_ill_index = 0;
6654 		pktinfop->ip4_addr = INADDR_ANY;
6655 
6656 		/*
6657 		 * If options passed in, feed it for verification and handling
6658 		 */
6659 		if (msg->msg_controllen != 0) {
6660 			error = process_auxiliary_options(connp,
6661 			    msg->msg_control, msg->msg_controllen,
6662 			    pktinfop, &icmp_opt_obj, icmp_opt_set, cr);
6663 			if (error != 0) {
6664 				goto done_lock;
6665 			}
6666 		}
6667 		rw_exit(&icmp->icmp_rwlock);
6668 
6669 		error = raw_ip_send_data_v4(connp->conn_wq, connp, mp,
6670 		    v4dst, pktinfop);
6671 		break;
6672 	}
6673 
6674 	default:
6675 		ASSERT(0);
6676 	}
6677 
6678 	goto done;
6679 
6680 done_lock:
6681 	rw_exit(&icmp->icmp_rwlock);
6682 	if (error != 0) {
6683 		ASSERT(mp != NULL);
6684 		freemsg(mp);
6685 	}
6686 done:
6687 	if (bypass_dgram_errind)
6688 		return (error);
6689 	return (icmp->icmp_dgram_errind ? error : 0);
6690 }
6691 
6692 sock_downcalls_t sock_rawip_downcalls = {
6693 	rawip_activate,
6694 	rawip_accept,
6695 	rawip_bind,
6696 	rawip_listen,
6697 	rawip_connect,
6698 	rawip_getpeername,
6699 	rawip_getsockname,
6700 	rawip_getsockopt,
6701 	rawip_setsockopt,
6702 	rawip_send,
6703 	NULL,
6704 	NULL,
6705 	NULL,
6706 	rawip_shutdown,
6707 	rawip_clr_flowctrl,
6708 	rawip_ioctl,
6709 	rawip_close
6710 };
6711