xref: /illumos-gate/usr/src/uts/common/inet/ip/icmp.c (revision 269e59f9a28bf47e0f463e64fc5af4a408b73b21)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /* Copyright (c) 1990 Mentat Inc. */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strlog.h>
30 #include <sys/strsun.h>
31 #define	_SUN_TPI_VERSION 2
32 #include <sys/tihdr.h>
33 #include <sys/timod.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/strsubr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/cmn_err.h>
40 #include <sys/kmem.h>
41 #include <sys/cred.h>
42 #include <sys/policy.h>
43 #include <sys/priv.h>
44 #include <sys/ucred.h>
45 #include <sys/zone.h>
46 
47 #include <sys/sockio.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/vtrace.h>
51 #include <sys/sdt.h>
52 #include <sys/debug.h>
53 #include <sys/isa_defs.h>
54 #include <sys/random.h>
55 #include <netinet/in.h>
56 #include <netinet/ip6.h>
57 #include <netinet/icmp6.h>
58 #include <netinet/udp.h>
59 
60 #include <inet/common.h>
61 #include <inet/ip.h>
62 #include <inet/ip_impl.h>
63 #include <inet/ipsec_impl.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_ire.h>
66 #include <inet/ip_if.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip_ndp.h>
69 #include <inet/proto_set.h>
70 #include <inet/mib2.h>
71 #include <inet/nd.h>
72 #include <inet/optcom.h>
73 #include <inet/snmpcom.h>
74 #include <inet/kstatcom.h>
75 #include <inet/ipclassifier.h>
76 
77 #include <sys/tsol/label.h>
78 #include <sys/tsol/tnet.h>
79 
80 #include <inet/rawip_impl.h>
81 
82 #include <sys/disp.h>
83 
84 /*
85  * Synchronization notes:
86  *
87  * RAWIP is MT and uses the usual kernel synchronization primitives. We use
88  * conn_lock to protect the icmp_t.
89  *
90  * Plumbing notes:
91  * ICMP is always a device driver. For compatibility with mibopen() code
92  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
93  * dummy module.
94  */
95 
/*
 * Forward declarations for routines defined later in this file.
 * Everything marked static is private to this file; the few prototypes
 * without "static" (icmp_opt_set, icmp_opt_get, rawip_getsockname,
 * rawip_getpeername) have external linkage.
 */
static void	icmp_addr_req(queue_t *q, mblk_t *mp);
static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void	icmp_bind_proto(icmp_t *icmp);
static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void	icmp_capability_req(queue_t *q, mblk_t *mp);
static int	icmp_close(queue_t *q, int flags);
static void	icmp_close_free(conn_t *);
static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void	icmp_info_req(queue_t *q, mblk_t *mp);
static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
/* Separate STREAMS open entry points, one per address family. */
static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
/* Option get/set: not static, so visible outside this file. */
int		icmp_opt_set(conn_t *connp, uint_t optset_context,
		    int level, int name, uint_t inlen,
		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
		    void *thisdg_attrs, cred_t *cr);
int		icmp_opt_get(conn_t *connp, int level, int name,
		    uchar_t *ptr);
static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
		    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
		    uchar_t *ptr, int len);
static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
/* Write-side put procedures (see the qinit tables below for icmp_wput). */
static void	icmp_wput(queue_t *q, mblk_t *mp);
static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void	icmp_wput_other(queue_t *q, mblk_t *mp);
static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

/* Per-netstack setup and teardown. */
static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rawip_stack_fini(netstackid_t stackid, void *arg);

/* Per-netstack kstat support. */
static void	*rawip_kstat_init(netstackid_t stackid);
static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int	rawip_kstat_update(kstat_t *kp, int rw);
static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t	*rawip_do_open(int, cred_t *, int *, int);
static void	rawip_do_close(conn_t *);
static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int	rawip_do_unbind(conn_t *);
static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

/* Socket name queries: not static, so visible outside this file. */
int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
165 
/*
 * STREAMS module information shared by every qinit in this file.
 * Positional initializer in module_info(9S) field order: module id,
 * module name, minimum packet size, maximum packet size, high-water mark,
 * low-water mark.
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};
169 
/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 *
 * Read-side qinit tables.  Positional initializers in qinit(9S) field
 * order: put, service, open, close, admin, module_info.  No read-side put
 * or service procedure is registered; the two tables differ only in the
 * open routine.
 */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};
181 
/*
 * Write-side qinit used by both streamtabs below: icmp_wput is the put
 * procedure and ip_wsrv the service procedure.  Open/close are supplied
 * by the read-side tables, so they are NULL here.
 */
static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/*
 * ICMP entry point during fallback: only a put procedure
 * (icmp_wput_fallback) is provided, with no service procedure.
 */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};
190 
/*
 * For AF_INET aka /dev/icmp.
 * Initializer is { read-side qinit, write-side qinit }.
 */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/*
 * For AF_INET6 aka /dev/icmp6.
 * Shares the write side with the IPv4 device; only the read-side qinit
 * (and therefore the open routine) differs.
 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};
200 
/*
 * Default structure copied into T_INFO_ACK messages.
 * ADDR_size and OPT_size are left zero in this template (see the
 * per-field comments); the state field starts as TS_UNBND.
 */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
215 
/*
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Note: All those tunables which do not start with "icmp_" are Committed and
 * therefore are public. See PSARC 2009/306.
 *
 * The position of each entry matters: the is_* accessor macros that follow
 * this table select tunables by numeric index (0 through 9), so entries
 * must not be reordered or inserted without updating those macros.
 * Each entry appears to be { name, protocol, setter, getter,
 * {min, max, default} or {default boolean}, {current value} } -- the exact
 * field layout comes from mod_prop_info_t, which is declared elsewhere.
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
	/* tunable - 0 */
	{ "icmp_wroff_extra", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 128, 32}, {32} },

	/* tunable - 1 */
	{ "icmp_ipv4_ttl", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 255, 255}, {255} },

	/* tunable - 2 */
	{ "icmp_ipv6_hoplimit", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
	    {IPV6_DEFAULT_HOPS} },

	/* tunable - 3 */
	{ "icmp_bsd_compat", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	/* tunable - 4 (accessed as is_xmit_hiwat) */
	{ "send_maxbuf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {4096, 65536, 8192}, {8192} },

	/* tunable - 5 */
	{ "icmp_xmit_lowat", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 65536, 1024}, {1024} },

	/* tunable - 6 (accessed as is_recv_hiwat) */
	{ "recv_maxbuf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {4096, 65536, 8192}, {8192} },

	/* tunable - 7 */
	{ "icmp_max_buf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {65536, 1024*1024*1024, 256*1024}, {256 * 1024} },

	/* tunable - 8 */
	{ "icmp_pmtu_discovery", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/* tunable - 9 */
	{ "icmp_sendto_ignerr", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/* "?" has no setter; its getter is mod_get_allprop */
	{ "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

	/* All-NULL/zero terminator entry */
	{ NULL, 0, NULL, NULL, {0}, {0} }
};
269 
/*
 * Shorthand accessors for the current value of each tunable, selected by
 * position in is_propinfo_tbl.  The indices correspond to the entry order
 * of icmp_propinfo_tbl above (presumably is_propinfo_tbl is the per-stack
 * copy of that table -- its declaration is not in this file section).
 * Unsigned tunables read prop_cur_uval, boolean ones prop_cur_bval.
 * Note that indices 4 and 6 are named "send_maxbuf" and "recv_maxbuf" in
 * the table but is_xmit_hiwat and is_recv_hiwat here.
 */
#define	is_wroff_extra			is_propinfo_tbl[0].prop_cur_uval
#define	is_ipv4_ttl			is_propinfo_tbl[1].prop_cur_uval
#define	is_ipv6_hoplimit		is_propinfo_tbl[2].prop_cur_uval
#define	is_bsd_compat			is_propinfo_tbl[3].prop_cur_bval
#define	is_xmit_hiwat			is_propinfo_tbl[4].prop_cur_uval
#define	is_xmit_lowat			is_propinfo_tbl[5].prop_cur_uval
#define	is_recv_hiwat			is_propinfo_tbl[6].prop_cur_uval
#define	is_max_buf			is_propinfo_tbl[7].prop_cur_uval
#define	is_pmtu_discovery		is_propinfo_tbl[8].prop_cur_bval
#define	is_sendto_ignerr		is_propinfo_tbl[9].prop_cur_bval
280 
/* Shorthand for a pointer to the union of TPI primitive structures. */
typedef union T_primitives *t_primp_t;
282 
283 /*
284  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
285  * passed to icmp_wput.
286  * It calls IP to verify the local IP address, and calls IP to insert
287  * the conn_t in the fanout table.
288  * If everything is ok it then sends the T_BIND_ACK back up.
289  */
290 static void
291 icmp_tpi_bind(queue_t *q, mblk_t *mp)
292 {
293 	int	error;
294 	struct sockaddr *sa;
295 	struct T_bind_req *tbr;
296 	socklen_t	len;
297 	sin_t	*sin;
298 	sin6_t	*sin6;
299 	icmp_t		*icmp;
300 	conn_t	*connp = Q_TO_CONN(q);
301 	mblk_t *mp1;
302 	cred_t *cr;
303 
304 	/*
305 	 * All Solaris components should pass a db_credp
306 	 * for this TPI message, hence we ASSERT.
307 	 * But in case there is some other M_PROTO that looks
308 	 * like a TPI message sent by some other kernel
309 	 * component, we check and return an error.
310 	 */
311 	cr = msg_getcred(mp, NULL);
312 	ASSERT(cr != NULL);
313 	if (cr == NULL) {
314 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
315 		return;
316 	}
317 
318 	icmp = connp->conn_icmp;
319 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
320 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
321 		    "icmp_bind: bad req, len %u",
322 		    (uint_t)(mp->b_wptr - mp->b_rptr));
323 		icmp_err_ack(q, mp, TPROTO, 0);
324 		return;
325 	}
326 
327 	if (icmp->icmp_state != TS_UNBND) {
328 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
329 		    "icmp_bind: bad state, %u", icmp->icmp_state);
330 		icmp_err_ack(q, mp, TOUTSTATE, 0);
331 		return;
332 	}
333 
334 	/*
335 	 * Reallocate the message to make sure we have enough room for an
336 	 * address.
337 	 */
338 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
339 	if (mp1 == NULL) {
340 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
341 		return;
342 	}
343 	mp = mp1;
344 
345 	/* Reset the message type in preparation for shipping it back. */
346 	DB_TYPE(mp) = M_PCPROTO;
347 	tbr = (struct T_bind_req *)mp->b_rptr;
348 	len = tbr->ADDR_length;
349 	switch (len) {
350 	case 0:	/* request for a generic port */
351 		tbr->ADDR_offset = sizeof (struct T_bind_req);
352 		if (connp->conn_family == AF_INET) {
353 			tbr->ADDR_length = sizeof (sin_t);
354 			sin = (sin_t *)&tbr[1];
355 			*sin = sin_null;
356 			sin->sin_family = AF_INET;
357 			mp->b_wptr = (uchar_t *)&sin[1];
358 			sa = (struct sockaddr *)sin;
359 			len = sizeof (sin_t);
360 		} else {
361 			ASSERT(connp->conn_family == AF_INET6);
362 			tbr->ADDR_length = sizeof (sin6_t);
363 			sin6 = (sin6_t *)&tbr[1];
364 			*sin6 = sin6_null;
365 			sin6->sin6_family = AF_INET6;
366 			mp->b_wptr = (uchar_t *)&sin6[1];
367 			sa = (struct sockaddr *)sin6;
368 			len = sizeof (sin6_t);
369 		}
370 		break;
371 
372 	case sizeof (sin_t):	/* Complete IPv4 address */
373 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
374 		    sizeof (sin_t));
375 		break;
376 
377 	case sizeof (sin6_t):	/* Complete IPv6 address */
378 		sa = (struct sockaddr *)mi_offset_param(mp,
379 		    tbr->ADDR_offset, sizeof (sin6_t));
380 		break;
381 
382 	default:
383 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
384 		    "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
385 		icmp_err_ack(q, mp, TBADADDR, 0);
386 		return;
387 	}
388 
389 	error = rawip_do_bind(connp, sa, len);
390 	if (error != 0) {
391 		if (error > 0) {
392 			icmp_err_ack(q, mp, TSYSERR, error);
393 		} else {
394 			icmp_err_ack(q, mp, -error, 0);
395 		}
396 	} else {
397 		tbr->PRIM_type = T_BIND_ACK;
398 		qreply(q, mp);
399 	}
400 }
401 
/*
 * Bind routine shared by the TPI path (icmp_tpi_bind) and, per the
 * "Common routines for TPI and socket module" prototype group, the socket
 * path.
 *
 * 'sa' must be a non-NULL, 32-bit aligned sin_t or sin6_t (selected by
 * 'len') whose family matches the endpoint's conn_family.  A non-wildcard
 * address is verified with ip_laddr_verify_v4/v6; the addresses, port and
 * scope are then recorded in the conn_t under conn_lock, the header
 * template is rebuilt, and the conn_t is inserted in the fanout via
 * ip_laddr_fanout_insert().
 *
 * Returns 0 on success, a positive errno value, or a negated TLI error
 * (-TOUTSTATE when the endpoint is not in TS_UNBND).  Failures after the
 * state has been changed are unwound at late_error.
 */
static int
rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
{
	sin_t		*sin;
	sin6_t		*sin6;
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	in_port_t	lport;		/* Network byte order */
	ipaddr_t	v4src;		/* Set if AF_INET */
	in6_addr_t	v6src;
	uint_t		scopeid = 0;
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	if (sa == NULL || !OK_32PTR((char *)sa)) {
		return (EINVAL);
	}

	switch (len) {
	case sizeof (sin_t):    /* Complete IPv4 address */
		sin = (sin_t *)sa;
		if (sin->sin_family != AF_INET ||
		    connp->conn_family != AF_INET) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		v4src = sin->sin_addr.s_addr;
		/* Addresses are stored in the conn_t in IPv6 (v4-mapped) form */
		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
		/* The wildcard address keeps the IPVL_UNICAST_UP default */
		if (v4src != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
			    B_TRUE);
		}
		lport = sin->sin_port;
		break;
	case sizeof (sin6_t): /* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		if (sin6->sin6_family != AF_INET6 ||
		    connp->conn_family != AF_INET6) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			/* TSYSERR, EADDRNOTAVAIL */
			return (EADDRNOTAVAIL);
		}
		v6src = sin6->sin6_addr;
		/* The unspecified address keeps the IPVL_UNICAST_UP default */
		if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			/* sin6_scope_id is only honored for link-scope addrs */
			if (IN6_IS_ADDR_LINKSCOPE(&v6src))
				scopeid = sin6->sin6_scope_id;
			laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
			    B_TRUE, scopeid);
		}
		lport = sin6->sin6_port;
		break;

	default:
		/* TBADADDR in TPI terms; reported to the caller as an errno */
		return (EADDRNOTAVAIL);
	}

	/* Is the local address a valid unicast, multicast, or broadcast? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	/*
	 * The state must be TS_UNBND.
	 * This is re-checked here because it must be evaluated under
	 * conn_lock; the negative return value is a negated TLI error.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_state != TS_UNBND) {
		mutex_exit(&connp->conn_lock);
		return (-TOUTSTATE);
	}

	/*
	 * Copy the source address into our icmp structure.  This address
	 * may still be zero; if so, ip will fill in the correct address
	 * each time an outbound packet is passed to it.
	 * If we are binding to a broadcast or multicast address then
	 * we just set the conn_bound_addr since we don't want to use
	 * that as the source address when sending.
	 */
	connp->conn_bound_addr_v6 = v6src;
	connp->conn_laddr_v6 = v6src;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
		connp->conn_ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	/* IPVL_BAD was rejected above, so it has no case here */
	switch (laddr_type) {
	case IPVL_UNICAST_UP:
	case IPVL_UNICAST_DOWN:
		connp->conn_saddr_v6 = v6src;
		connp->conn_mcbc_bind = B_FALSE;
		break;
	case IPVL_MCAST:
	case IPVL_BCAST:
		/* ip_set_destination will pick a source address later */
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_mcbc_bind = B_TRUE;
		break;
	}

	/* Any errors after this point should use late_error */

	/*
	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
	 * with IPPROTO_TCP.
	 */
	connp->conn_lport = lport;
	connp->conn_fport = 0;

	if (connp->conn_family == AF_INET) {
		ASSERT(connp->conn_ipversion == IPV4_VERSION);
	} else {
		ASSERT(connp->conn_ipversion == IPV6_VERSION);
	}

	icmp->icmp_state = TS_IDLE;

	/*
	 * We create an initial header template here to make a subsequent
	 * sendto have a starting point. Since conn_last_dst is zero the
	 * first sendto will always follow the 'dst changed' code path.
	 * Note that we defer massaging options and the related checksum
	 * adjustment until we have a destination address.
	 */
	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	if (error != 0) {
		/* late_error re-takes conn_lock, so release it first */
		mutex_exit(&connp->conn_lock);
		goto late_error;
	}
	/* Just in case */
	connp->conn_faddr_v6 = ipv6_all_zeros;
	connp->conn_v6lastdst = ipv6_all_zeros;
	mutex_exit(&connp->conn_lock);

	/* The fanout insertion is done without holding conn_lock */
	error = ip_laddr_fanout_insert(connp);
	if (error != 0)
		goto late_error;

	/* Bind succeeded */
	return (0);

late_error:
	/*
	 * Undo the bind: clear the addresses, scope and port recorded
	 * above and put the endpoint back in TS_UNBND.  Entered with
	 * conn_lock not held.
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_saddr_v6 = ipv6_all_zeros;
	connp->conn_bound_addr_v6 = ipv6_all_zeros;
	connp->conn_laddr_v6 = ipv6_all_zeros;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}
	icmp->icmp_state = TS_UNBND;
	connp->conn_v6lastdst = ipv6_all_zeros;
	connp->conn_lport = 0;

	/* Restore the header that was built above - different source address */
	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	mutex_exit(&connp->conn_lock);
	return (error);
}
571 
572 /*
573  * Tell IP to just bind to the protocol.
574  */
575 static void
576 icmp_bind_proto(icmp_t *icmp)
577 {
578 	conn_t	*connp = icmp->icmp_connp;
579 
580 	mutex_enter(&connp->conn_lock);
581 	connp->conn_saddr_v6 = ipv6_all_zeros;
582 	connp->conn_laddr_v6 = ipv6_all_zeros;
583 	connp->conn_faddr_v6 = ipv6_all_zeros;
584 	connp->conn_v6lastdst = ipv6_all_zeros;
585 	mutex_exit(&connp->conn_lock);
586 
587 	(void) ip_laddr_fanout_insert(connp);
588 }
589 
590 /*
591  * This routine handles each T_CONN_REQ message passed to icmp.  It
592  * associates a default destination address with the stream.
593  *
594  * After various error checks are completed, icmp_connect() lays
595  * the target address and port into the composite header template.
596  * Then we ask IP for information, including a source address if we didn't
597  * already have one. Finally we send up the T_OK_ACK reply message.
598  */
599 static void
600 icmp_tpi_connect(queue_t *q, mblk_t *mp)
601 {
602 	conn_t	*connp = Q_TO_CONN(q);
603 	struct T_conn_req	*tcr;
604 	struct sockaddr *sa;
605 	socklen_t len;
606 	int error;
607 	cred_t *cr;
608 	pid_t pid;
609 	/*
610 	 * All Solaris components should pass a db_credp
611 	 * for this TPI message, hence we ASSERT.
612 	 * But in case there is some other M_PROTO that looks
613 	 * like a TPI message sent by some other kernel
614 	 * component, we check and return an error.
615 	 */
616 	cr = msg_getcred(mp, &pid);
617 	ASSERT(cr != NULL);
618 	if (cr == NULL) {
619 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
620 		return;
621 	}
622 
623 	tcr = (struct T_conn_req *)mp->b_rptr;
624 	/* Sanity checks */
625 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
626 		icmp_err_ack(q, mp, TPROTO, 0);
627 		return;
628 	}
629 
630 	if (tcr->OPT_length != 0) {
631 		icmp_err_ack(q, mp, TBADOPT, 0);
632 		return;
633 	}
634 
635 	len = tcr->DEST_length;
636 
637 	switch (len) {
638 	default:
639 		icmp_err_ack(q, mp, TBADADDR, 0);
640 		return;
641 	case sizeof (sin_t):
642 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
643 		    sizeof (sin_t));
644 		break;
645 	case sizeof (sin6_t):
646 		sa = (struct sockaddr *)mi_offset_param(mp,
647 		    tcr->DEST_offset, sizeof (sin6_t));
648 		break;
649 	}
650 
651 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
652 	if (error != 0) {
653 		icmp_err_ack(q, mp, TSYSERR, error);
654 		return;
655 	}
656 
657 	error = rawip_do_connect(connp, sa, len, cr, pid);
658 	if (error != 0) {
659 		if (error < 0) {
660 			icmp_err_ack(q, mp, -error, 0);
661 		} else {
662 			icmp_err_ack(q, mp, 0, error);
663 		}
664 	} else {
665 		mblk_t *mp1;
666 
667 		/*
668 		 * We have to send a connection confirmation to
669 		 * keep TLI happy.
670 		 */
671 		if (connp->conn_family == AF_INET) {
672 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
673 			    sizeof (sin_t), NULL, 0);
674 		} else {
675 			ASSERT(connp->conn_family == AF_INET6);
676 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
677 			    sizeof (sin6_t), NULL, 0);
678 		}
679 		if (mp1 == NULL) {
680 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
681 			return;
682 		}
683 
684 		/*
685 		 * Send ok_ack for T_CONN_REQ
686 		 */
687 		mp = mi_tpi_ok_ack_alloc(mp);
688 		if (mp == NULL) {
689 			/* Unable to reuse the T_CONN_REQ for the ack. */
690 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
691 			return;
692 		}
693 		putnext(connp->conn_rq, mp);
694 		putnext(connp->conn_rq, mp1);
695 	}
696 }
697 
698 static int
699 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
700     cred_t *cr, pid_t pid)
701 {
702 	icmp_t		*icmp;
703 	sin_t		*sin;
704 	sin6_t		*sin6;
705 	int		error;
706 	uint16_t 	dstport;
707 	ipaddr_t	v4dst;
708 	in6_addr_t	v6dst;
709 	uint32_t	flowinfo;
710 	ip_xmit_attr_t	*ixa;
711 	ip_xmit_attr_t	*oldixa;
712 	uint_t		scopeid = 0;
713 	uint_t		srcid = 0;
714 	in6_addr_t	v6src = connp->conn_saddr_v6;
715 
716 	icmp = connp->conn_icmp;
717 
718 	if (sa == NULL || !OK_32PTR((char *)sa)) {
719 		return (EINVAL);
720 	}
721 
722 	ASSERT(sa != NULL && len != 0);
723 
724 	/*
725 	 * Determine packet type based on type of address passed in
726 	 * the request should contain an IPv4 or IPv6 address.
727 	 * Make sure that address family matches the type of
728 	 * family of the address passed down.
729 	 */
730 	switch (len) {
731 	case sizeof (sin_t):
732 		sin = (sin_t *)sa;
733 
734 		v4dst = sin->sin_addr.s_addr;
735 		dstport = sin->sin_port;
736 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
737 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
738 		break;
739 
740 	case sizeof (sin6_t):
741 		sin6 = (sin6_t *)sa;
742 
743 		/* No support for mapped addresses on raw sockets */
744 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
745 			return (EADDRNOTAVAIL);
746 		}
747 		v6dst = sin6->sin6_addr;
748 		dstport = sin6->sin6_port;
749 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
750 		flowinfo = sin6->sin6_flowinfo;
751 		if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
752 			scopeid = sin6->sin6_scope_id;
753 		srcid = sin6->__sin6_src_id;
754 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
755 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
756 			    connp->conn_netstack);
757 		}
758 		break;
759 	}
760 
761 	/*
762 	 * If there is a different thread using conn_ixa then we get a new
763 	 * copy and cut the old one loose from conn_ixa. Otherwise we use
764 	 * conn_ixa and prevent any other thread from using/changing it.
765 	 * Once connect() is done other threads can use conn_ixa since the
766 	 * refcnt will be back at one.
767 	 * We defer updating conn_ixa until later to handle any concurrent
768 	 * conn_ixa_cleanup thread.
769 	 */
770 	ixa = conn_get_ixa(connp, B_FALSE);
771 	if (ixa == NULL)
772 		return (ENOMEM);
773 
774 	ASSERT(ixa->ixa_refcnt >= 2);
775 	ASSERT(ixa == connp->conn_ixa);
776 
777 	mutex_enter(&connp->conn_lock);
778 	/*
779 	 * This icmp_t must have bound already before doing a connect.
780 	 * Reject if a connect is in progress (we drop conn_lock during
781 	 * rawip_do_connect).
782 	 */
783 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
784 		mutex_exit(&connp->conn_lock);
785 		ixa_refrele(ixa);
786 		return (-TOUTSTATE);
787 	}
788 
789 	if (icmp->icmp_state == TS_DATA_XFER) {
790 		/* Already connected - clear out state */
791 		if (connp->conn_mcbc_bind)
792 			connp->conn_saddr_v6 = ipv6_all_zeros;
793 		else
794 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
795 		connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
796 		connp->conn_faddr_v6 = ipv6_all_zeros;
797 		icmp->icmp_state = TS_IDLE;
798 	}
799 
800 	/*
801 	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
802 	 * with IPPROTO_TCP.
803 	 */
804 	connp->conn_fport = dstport;
805 	if (connp->conn_ipversion == IPV4_VERSION) {
806 		/*
807 		 * Interpret a zero destination to mean loopback.
808 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
809 		 * generate the T_CONN_CON.
810 		 */
811 		if (v4dst == INADDR_ANY) {
812 			v4dst = htonl(INADDR_LOOPBACK);
813 			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
814 			ASSERT(connp->conn_family == AF_INET);
815 			sin->sin_addr.s_addr = v4dst;
816 		}
817 		connp->conn_faddr_v6 = v6dst;
818 		connp->conn_flowinfo = 0;
819 	} else {
820 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
821 		/*
822 		 * Interpret a zero destination to mean loopback.
823 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
824 		 * generate the T_CONN_CON.
825 		 */
826 		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
827 			v6dst = ipv6_loopback;
828 			sin6->sin6_addr = v6dst;
829 		}
830 		connp->conn_faddr_v6 = v6dst;
831 		connp->conn_flowinfo = flowinfo;
832 	}
833 
834 	/*
835 	 * We update our cred/cpid based on the caller of connect
836 	 */
837 	if (connp->conn_cred != cr) {
838 		crhold(cr);
839 		crfree(connp->conn_cred);
840 		connp->conn_cred = cr;
841 	}
842 	connp->conn_cpid = pid;
843 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
844 	ixa->ixa_cred = cr;
845 	ixa->ixa_cpid = pid;
846 	if (is_system_labeled()) {
847 		/* We need to restart with a label based on the cred */
848 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
849 	}
850 
851 	if (scopeid != 0) {
852 		ixa->ixa_flags |= IXAF_SCOPEID_SET;
853 		ixa->ixa_scopeid = scopeid;
854 		connp->conn_incoming_ifindex = scopeid;
855 	} else {
856 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
857 		connp->conn_incoming_ifindex = connp->conn_bound_if;
858 	}
859 
860 	/*
861 	 * conn_connect will drop conn_lock and reacquire it.
862 	 * To prevent a send* from messing with this icmp_t while the lock
863 	 * is dropped we set icmp_state and clear conn_v6lastdst.
864 	 * That will make all send* fail with EISCONN.
865 	 */
866 	connp->conn_v6lastdst = ipv6_all_zeros;
867 	icmp->icmp_state = TS_WCON_CREQ;
868 
869 	error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
870 	mutex_exit(&connp->conn_lock);
871 	if (error != 0)
872 		goto connect_failed;
873 
874 	/*
875 	 * The addresses have been verified. Time to insert in
876 	 * the correct fanout list.
877 	 */
878 	error = ipcl_conn_insert(connp);
879 	if (error != 0)
880 		goto connect_failed;
881 
882 	mutex_enter(&connp->conn_lock);
883 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
884 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
885 	if (error != 0) {
886 		mutex_exit(&connp->conn_lock);
887 		goto connect_failed;
888 	}
889 
890 	icmp->icmp_state = TS_DATA_XFER;
891 	/* Record this as the "last" send even though we haven't sent any */
892 	connp->conn_v6lastdst = connp->conn_faddr_v6;
893 	connp->conn_lastipversion = connp->conn_ipversion;
894 	connp->conn_lastdstport = connp->conn_fport;
895 	connp->conn_lastflowinfo = connp->conn_flowinfo;
896 	connp->conn_lastscopeid = scopeid;
897 	connp->conn_lastsrcid = srcid;
898 	/* Also remember a source to use together with lastdst */
899 	connp->conn_v6lastsrc = v6src;
900 
901 	oldixa = conn_replace_ixa(connp, ixa);
902 	mutex_exit(&connp->conn_lock);
903 	ixa_refrele(oldixa);
904 
905 	ixa_refrele(ixa);
906 	return (0);
907 
908 connect_failed:
909 	if (ixa != NULL)
910 		ixa_refrele(ixa);
911 	mutex_enter(&connp->conn_lock);
912 	icmp->icmp_state = TS_IDLE;
913 	/* In case the source address was set above */
914 	if (connp->conn_mcbc_bind)
915 		connp->conn_saddr_v6 = ipv6_all_zeros;
916 	else
917 		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
918 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
919 	connp->conn_faddr_v6 = ipv6_all_zeros;
920 	connp->conn_v6lastdst = ipv6_all_zeros;
921 	connp->conn_flowinfo = 0;
922 
923 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
924 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
925 	mutex_exit(&connp->conn_lock);
926 	return (error);
927 }
928 
929 static void
930 rawip_do_close(conn_t *connp)
931 {
932 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
933 
934 	ip_quiesce_conn(connp);
935 
936 	if (!IPCL_IS_NONSTR(connp)) {
937 		qprocsoff(connp->conn_rq);
938 	}
939 
940 	icmp_close_free(connp);
941 
942 	/*
943 	 * Now we are truly single threaded on this stream, and can
944 	 * delete the things hanging off the connp, and finally the connp.
945 	 * We removed this connp from the fanout list, it cannot be
946 	 * accessed thru the fanouts, and we already waited for the
947 	 * conn_ref to drop to 0. We are already in close, so
948 	 * there cannot be any other thread from the top. qprocsoff
949 	 * has completed, and service has completed or won't run in
950 	 * future.
951 	 */
952 	ASSERT(connp->conn_ref == 1);
953 
954 	if (!IPCL_IS_NONSTR(connp)) {
955 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
956 	} else {
957 		ip_free_helper_stream(connp);
958 	}
959 
960 	connp->conn_ref--;
961 	ipcl_conn_destroy(connp);
962 }
963 
964 static int
965 icmp_close(queue_t *q, int flags)
966 {
967 	conn_t  *connp;
968 
969 	if (flags & SO_FALLBACK) {
970 		/*
971 		 * stream is being closed while in fallback
972 		 * simply free the resources that were allocated
973 		 */
974 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
975 		qprocsoff(q);
976 		goto done;
977 	}
978 
979 	connp = Q_TO_CONN(q);
980 	(void) rawip_do_close(connp);
981 done:
982 	q->q_ptr = WR(q)->q_ptr = NULL;
983 	return (0);
984 }
985 
986 static void
987 icmp_close_free(conn_t *connp)
988 {
989 	icmp_t *icmp = connp->conn_icmp;
990 
991 	if (icmp->icmp_filter != NULL) {
992 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
993 		icmp->icmp_filter = NULL;
994 	}
995 
996 	/*
997 	 * Clear any fields which the kmem_cache constructor clears.
998 	 * Only icmp_connp needs to be preserved.
999 	 * TBD: We should make this more efficient to avoid clearing
1000 	 * everything.
1001 	 */
1002 	ASSERT(icmp->icmp_connp == connp);
1003 	bzero(icmp, sizeof (icmp_t));
1004 	icmp->icmp_connp = connp;
1005 }
1006 
1007 /*
1008  * This routine handles each T_DISCON_REQ message passed to icmp
1009  * as an indicating that ICMP is no longer connected. This results
1010  * in telling IP to restore the binding to just the local address.
1011  */
1012 static int
1013 icmp_do_disconnect(conn_t *connp)
1014 {
1015 	icmp_t	*icmp = connp->conn_icmp;
1016 	int	error;
1017 
1018 	mutex_enter(&connp->conn_lock);
1019 	if (icmp->icmp_state != TS_DATA_XFER) {
1020 		mutex_exit(&connp->conn_lock);
1021 		return (-TOUTSTATE);
1022 	}
1023 	if (connp->conn_mcbc_bind)
1024 		connp->conn_saddr_v6 = ipv6_all_zeros;
1025 	else
1026 		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
1027 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
1028 	connp->conn_faddr_v6 = ipv6_all_zeros;
1029 	icmp->icmp_state = TS_IDLE;
1030 
1031 	connp->conn_v6lastdst = ipv6_all_zeros;
1032 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
1033 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
1034 	mutex_exit(&connp->conn_lock);
1035 	if (error != 0)
1036 		return (error);
1037 
1038 	/*
1039 	 * Tell IP to remove the full binding and revert
1040 	 * to the local address binding.
1041 	 */
1042 	return (ip_laddr_fanout_insert(connp));
1043 }
1044 
1045 static void
1046 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1047 {
1048 	conn_t	*connp = Q_TO_CONN(q);
1049 	int	error;
1050 
1051 	/*
1052 	 * Allocate the largest primitive we need to send back
1053 	 * T_error_ack is > than T_ok_ack
1054 	 */
1055 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1056 	if (mp == NULL) {
1057 		/* Unable to reuse the T_DISCON_REQ for the ack. */
1058 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1059 		return;
1060 	}
1061 
1062 	error = icmp_do_disconnect(connp);
1063 
1064 	if (error != 0) {
1065 		if (error > 0) {
1066 			icmp_err_ack(q, mp, 0, error);
1067 		} else {
1068 			icmp_err_ack(q, mp, -error, 0);
1069 		}
1070 	} else {
1071 		mp = mi_tpi_ok_ack_alloc(mp);
1072 		ASSERT(mp != NULL);
1073 		qreply(q, mp);
1074 	}
1075 }
1076 
1077 static int
1078 icmp_disconnect(conn_t *connp)
1079 {
1080 	int	error;
1081 
1082 	connp->conn_dgram_errind = B_FALSE;
1083 
1084 	error = icmp_do_disconnect(connp);
1085 
1086 	if (error < 0)
1087 		error = proto_tlitosyserr(-error);
1088 	return (error);
1089 }
1090 
1091 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1092 static void
1093 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1094 {
1095 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1096 		qreply(q, mp);
1097 }
1098 
1099 /* Shorthand to generate and send TPI error acks to our client */
1100 static void
1101 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1102     t_scalar_t t_error, int sys_error)
1103 {
1104 	struct T_error_ack	*teackp;
1105 
1106 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1107 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1108 		teackp = (struct T_error_ack *)mp->b_rptr;
1109 		teackp->ERROR_prim = primitive;
1110 		teackp->TLI_error = t_error;
1111 		teackp->UNIX_error = sys_error;
1112 		qreply(q, mp);
1113 	}
1114 }
1115 
1116 /*
1117  * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1118  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1119  * Assumes that IP has pulled up everything up to and including the ICMP header.
1120  */
1121 /* ARGSUSED2 */
1122 static void
1123 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1124 {
1125 	conn_t		*connp = (conn_t *)arg1;
1126 	icmp_t		*icmp = connp->conn_icmp;
1127 	icmph_t		*icmph;
1128 	ipha_t		*ipha;
1129 	int		iph_hdr_length;
1130 	sin_t		sin;
1131 	mblk_t		*mp1;
1132 	int		error = 0;
1133 
1134 	ipha = (ipha_t *)mp->b_rptr;
1135 
1136 	ASSERT(OK_32PTR(mp->b_rptr));
1137 
1138 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1139 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1140 		icmp_icmp_error_ipv6(connp, mp, ira);
1141 		return;
1142 	}
1143 	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1144 
1145 	/* Skip past the outer IP and ICMP headers */
1146 	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1147 	iph_hdr_length = ira->ira_ip_hdr_length;
1148 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1149 	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */
1150 
1151 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1152 
1153 	switch (icmph->icmph_type) {
1154 	case ICMP_DEST_UNREACHABLE:
1155 		switch (icmph->icmph_code) {
1156 		case ICMP_FRAGMENTATION_NEEDED: {
1157 			ipha_t		*ipha;
1158 			ip_xmit_attr_t	*ixa;
1159 			/*
1160 			 * IP has already adjusted the path MTU.
1161 			 * But we need to adjust DF for IPv4.
1162 			 */
1163 			if (connp->conn_ipversion != IPV4_VERSION)
1164 				break;
1165 
1166 			ixa = conn_get_ixa(connp, B_FALSE);
1167 			if (ixa == NULL || ixa->ixa_ire == NULL) {
1168 				/*
1169 				 * Some other thread holds conn_ixa. We will
1170 				 * redo this on the next ICMP too big.
1171 				 */
1172 				if (ixa != NULL)
1173 					ixa_refrele(ixa);
1174 				break;
1175 			}
1176 			(void) ip_get_pmtu(ixa);
1177 
1178 			mutex_enter(&connp->conn_lock);
1179 			ipha = (ipha_t *)connp->conn_ht_iphc;
1180 			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1181 				ipha->ipha_fragment_offset_and_flags |=
1182 				    IPH_DF_HTONS;
1183 			} else {
1184 				ipha->ipha_fragment_offset_and_flags &=
1185 				    ~IPH_DF_HTONS;
1186 			}
1187 			mutex_exit(&connp->conn_lock);
1188 			ixa_refrele(ixa);
1189 			break;
1190 		}
1191 		case ICMP_PORT_UNREACHABLE:
1192 		case ICMP_PROTOCOL_UNREACHABLE:
1193 			error = ECONNREFUSED;
1194 			break;
1195 		default:
1196 			/* Transient errors */
1197 			break;
1198 		}
1199 		break;
1200 	default:
1201 		/* Transient errors */
1202 		break;
1203 	}
1204 	if (error == 0) {
1205 		freemsg(mp);
1206 		return;
1207 	}
1208 
1209 	/*
1210 	 * Deliver T_UDERROR_IND when the application has asked for it.
1211 	 * The socket layer enables this automatically when connected.
1212 	 */
1213 	if (!connp->conn_dgram_errind) {
1214 		freemsg(mp);
1215 		return;
1216 	}
1217 
1218 	sin = sin_null;
1219 	sin.sin_family = AF_INET;
1220 	sin.sin_addr.s_addr = ipha->ipha_dst;
1221 
1222 	if (IPCL_IS_NONSTR(connp)) {
1223 		mutex_enter(&connp->conn_lock);
1224 		if (icmp->icmp_state == TS_DATA_XFER) {
1225 			if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1226 				mutex_exit(&connp->conn_lock);
1227 				(*connp->conn_upcalls->su_set_error)
1228 				    (connp->conn_upper_handle, error);
1229 				goto done;
1230 			}
1231 		} else {
1232 			icmp->icmp_delayed_error = error;
1233 			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1234 		}
1235 		mutex_exit(&connp->conn_lock);
1236 	} else {
1237 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1238 		    error);
1239 		if (mp1 != NULL)
1240 			putnext(connp->conn_rq, mp1);
1241 	}
1242 done:
1243 	freemsg(mp);
1244 }
1245 
1246 /*
1247  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6.
1248  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1249  * Assumes that IP has pulled up all the extension headers as well as the
1250  * ICMPv6 header.
1251  */
1252 static void
1253 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1254 {
1255 	icmp6_t		*icmp6;
1256 	ip6_t		*ip6h, *outer_ip6h;
1257 	uint16_t	iph_hdr_length;
1258 	uint8_t		*nexthdrp;
1259 	sin6_t		sin6;
1260 	mblk_t		*mp1;
1261 	int		error = 0;
1262 	icmp_t		*icmp = connp->conn_icmp;
1263 
1264 	outer_ip6h = (ip6_t *)mp->b_rptr;
1265 #ifdef DEBUG
1266 	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1267 		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1268 	else
1269 		iph_hdr_length = IPV6_HDR_LEN;
1270 	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1271 #endif
1272 	/* Skip past the outer IP and ICMP headers */
1273 	iph_hdr_length = ira->ira_ip_hdr_length;
1274 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1275 
1276 	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
1277 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1278 		freemsg(mp);
1279 		return;
1280 	}
1281 
1282 	switch (icmp6->icmp6_type) {
1283 	case ICMP6_DST_UNREACH:
1284 		switch (icmp6->icmp6_code) {
1285 		case ICMP6_DST_UNREACH_NOPORT:
1286 			error = ECONNREFUSED;
1287 			break;
1288 		case ICMP6_DST_UNREACH_ADMIN:
1289 		case ICMP6_DST_UNREACH_NOROUTE:
1290 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1291 		case ICMP6_DST_UNREACH_ADDR:
1292 			/* Transient errors */
1293 			break;
1294 		default:
1295 			break;
1296 		}
1297 		break;
1298 	case ICMP6_PACKET_TOO_BIG: {
1299 		struct T_unitdata_ind	*tudi;
1300 		struct T_opthdr		*toh;
1301 		size_t			udi_size;
1302 		mblk_t			*newmp;
1303 		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1304 		    sizeof (struct ip6_mtuinfo);
1305 		sin6_t			*sin6;
1306 		struct ip6_mtuinfo	*mtuinfo;
1307 
1308 		/*
1309 		 * If the application has requested to receive path mtu
1310 		 * information, send up an empty message containing an
1311 		 * IPV6_PATHMTU ancillary data item.
1312 		 */
1313 		if (!connp->conn_ipv6_recvpathmtu)
1314 			break;
1315 
1316 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1317 		    opt_length;
1318 		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1319 			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1320 			break;
1321 		}
1322 
1323 		/*
1324 		 * newmp->b_cont is left to NULL on purpose.  This is an
1325 		 * empty message containing only ancillary data.
1326 		 */
1327 		newmp->b_datap->db_type = M_PROTO;
1328 		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1329 		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1330 		tudi->PRIM_type = T_UNITDATA_IND;
1331 		tudi->SRC_length = sizeof (sin6_t);
1332 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1333 		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1334 		tudi->OPT_length = opt_length;
1335 
1336 		sin6 = (sin6_t *)&tudi[1];
1337 		bzero(sin6, sizeof (sin6_t));
1338 		sin6->sin6_family = AF_INET6;
1339 		sin6->sin6_addr = connp->conn_faddr_v6;
1340 
1341 		toh = (struct T_opthdr *)&sin6[1];
1342 		toh->level = IPPROTO_IPV6;
1343 		toh->name = IPV6_PATHMTU;
1344 		toh->len = opt_length;
1345 		toh->status = 0;
1346 
1347 		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1348 		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1349 		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1350 		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1351 		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1352 		/*
1353 		 * We've consumed everything we need from the original
1354 		 * message.  Free it, then send our empty message.
1355 		 */
1356 		freemsg(mp);
1357 		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1358 		return;
1359 	}
1360 	case ICMP6_TIME_EXCEEDED:
1361 		/* Transient errors */
1362 		break;
1363 	case ICMP6_PARAM_PROB:
1364 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1365 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1366 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1367 		    (uchar_t *)nexthdrp) {
1368 			error = ECONNREFUSED;
1369 			break;
1370 		}
1371 		break;
1372 	}
1373 	if (error == 0) {
1374 		freemsg(mp);
1375 		return;
1376 	}
1377 
1378 	/*
1379 	 * Deliver T_UDERROR_IND when the application has asked for it.
1380 	 * The socket layer enables this automatically when connected.
1381 	 */
1382 	if (!connp->conn_dgram_errind) {
1383 		freemsg(mp);
1384 		return;
1385 	}
1386 
1387 	sin6 = sin6_null;
1388 	sin6.sin6_family = AF_INET6;
1389 	sin6.sin6_addr = ip6h->ip6_dst;
1390 	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1391 	if (IPCL_IS_NONSTR(connp)) {
1392 		mutex_enter(&connp->conn_lock);
1393 		if (icmp->icmp_state == TS_DATA_XFER) {
1394 			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1395 			    &connp->conn_faddr_v6)) {
1396 				mutex_exit(&connp->conn_lock);
1397 				(*connp->conn_upcalls->su_set_error)
1398 				    (connp->conn_upper_handle, error);
1399 				goto done;
1400 			}
1401 		} else {
1402 			icmp->icmp_delayed_error = error;
1403 			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1404 		}
1405 		mutex_exit(&connp->conn_lock);
1406 	} else {
1407 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1408 		    NULL, 0, error);
1409 		if (mp1 != NULL)
1410 			putnext(connp->conn_rq, mp1);
1411 	}
1412 done:
1413 	freemsg(mp);
1414 }
1415 
1416 /*
1417  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1418  * The local address is filled in if endpoint is bound. The remote address
1419  * is filled in if remote address has been precified ("connected endpoint")
1420  * (The concept of connected CLTS sockets is alien to published TPI
1421  *  but we support it anyway).
1422  */
1423 static void
1424 icmp_addr_req(queue_t *q, mblk_t *mp)
1425 {
1426 	struct sockaddr *sa;
1427 	mblk_t	*ackmp;
1428 	struct T_addr_ack *taa;
1429 	icmp_t	*icmp = Q_TO_ICMP(q);
1430 	conn_t	*connp = icmp->icmp_connp;
1431 	uint_t	addrlen;
1432 
1433 	/* Make it large enough for worst case */
1434 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1435 	    2 * sizeof (sin6_t), 1);
1436 	if (ackmp == NULL) {
1437 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1438 		return;
1439 	}
1440 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1441 
1442 	bzero(taa, sizeof (struct T_addr_ack));
1443 	ackmp->b_wptr = (uchar_t *)&taa[1];
1444 
1445 	taa->PRIM_type = T_ADDR_ACK;
1446 	ackmp->b_datap->db_type = M_PCPROTO;
1447 
1448 	if (connp->conn_family == AF_INET)
1449 		addrlen = sizeof (sin_t);
1450 	else
1451 		addrlen = sizeof (sin6_t);
1452 
1453 	mutex_enter(&connp->conn_lock);
1454 	/*
1455 	 * Note: Following code assumes 32 bit alignment of basic
1456 	 * data structures like sin_t and struct T_addr_ack.
1457 	 */
1458 	if (icmp->icmp_state != TS_UNBND) {
1459 		/*
1460 		 * Fill in local address first
1461 		 */
1462 		taa->LOCADDR_offset = sizeof (*taa);
1463 		taa->LOCADDR_length = addrlen;
1464 		sa = (struct sockaddr *)&taa[1];
1465 		(void) conn_getsockname(connp, sa, &addrlen);
1466 		ackmp->b_wptr += addrlen;
1467 	}
1468 	if (icmp->icmp_state == TS_DATA_XFER) {
1469 		/*
1470 		 * connected, fill remote address too
1471 		 */
1472 		taa->REMADDR_length = addrlen;
1473 		/* assumed 32-bit alignment */
1474 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1475 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1476 		(void) conn_getpeername(connp, sa, &addrlen);
1477 		ackmp->b_wptr += addrlen;
1478 	}
1479 	mutex_exit(&connp->conn_lock);
1480 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1481 	qreply(q, ackmp);
1482 }
1483 
1484 static void
1485 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1486 {
1487 	conn_t		*connp = icmp->icmp_connp;
1488 
1489 	*tap = icmp_g_t_info_ack;
1490 
1491 	if (connp->conn_family == AF_INET6)
1492 		tap->ADDR_size = sizeof (sin6_t);
1493 	else
1494 		tap->ADDR_size = sizeof (sin_t);
1495 	tap->CURRENT_state = icmp->icmp_state;
1496 	tap->OPT_size = icmp_max_optsize;
1497 }
1498 
1499 static void
1500 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1501     t_uscalar_t cap_bits1)
1502 {
1503 	tcap->CAP_bits1 = 0;
1504 
1505 	if (cap_bits1 & TC1_INFO) {
1506 		icmp_copy_info(&tcap->INFO_ack, icmp);
1507 		tcap->CAP_bits1 |= TC1_INFO;
1508 	}
1509 }
1510 
1511 /*
1512  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1513  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1514  * icmp_g_t_info_ack.  The current state of the stream is copied from
1515  * icmp_state.
1516  */
1517 static void
1518 icmp_capability_req(queue_t *q, mblk_t *mp)
1519 {
1520 	icmp_t			*icmp = Q_TO_ICMP(q);
1521 	t_uscalar_t		cap_bits1;
1522 	struct T_capability_ack	*tcap;
1523 
1524 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1525 
1526 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1527 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1528 	if (!mp)
1529 		return;
1530 
1531 	tcap = (struct T_capability_ack *)mp->b_rptr;
1532 
1533 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1534 
1535 	qreply(q, mp);
1536 }
1537 
1538 /*
1539  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1540  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1541  * The current state of the stream is copied from icmp_state.
1542  */
1543 static void
1544 icmp_info_req(queue_t *q, mblk_t *mp)
1545 {
1546 	icmp_t	*icmp = Q_TO_ICMP(q);
1547 
1548 	/* Create a T_INFO_ACK message. */
1549 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1550 	    T_INFO_ACK);
1551 	if (!mp)
1552 		return;
1553 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1554 	qreply(q, mp);
1555 }
1556 
1557 static int
1558 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1559     int family)
1560 {
1561 	conn_t *connp;
1562 	dev_t	conn_dev;
1563 	int	error;
1564 
1565 	/* If the stream is already open, return immediately. */
1566 	if (q->q_ptr != NULL)
1567 		return (0);
1568 
1569 	if (sflag == MODOPEN)
1570 		return (EINVAL);
1571 
1572 	/*
1573 	 * Since ICMP is not used so heavily, allocating from the small
1574 	 * arena should be sufficient.
1575 	 */
1576 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1577 		return (EBUSY);
1578 	}
1579 
1580 	if (flag & SO_FALLBACK) {
1581 		/*
1582 		 * Non streams socket needs a stream to fallback to
1583 		 */
1584 		RD(q)->q_ptr = (void *)conn_dev;
1585 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1586 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1587 		qprocson(q);
1588 		return (0);
1589 	}
1590 
1591 	connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1592 	if (connp == NULL) {
1593 		ASSERT(error != 0);
1594 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1595 		return (error);
1596 	}
1597 
1598 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1599 	connp->conn_dev = conn_dev;
1600 	connp->conn_minor_arena = ip_minor_arena_sa;
1601 
1602 	/*
1603 	 * Initialize the icmp_t structure for this stream.
1604 	 */
1605 	q->q_ptr = connp;
1606 	WR(q)->q_ptr = connp;
1607 	connp->conn_rq = q;
1608 	connp->conn_wq = WR(q);
1609 
1610 	WR(q)->q_hiwat = connp->conn_sndbuf;
1611 	WR(q)->q_lowat = connp->conn_sndlowat;
1612 
1613 	qprocson(q);
1614 
1615 	/* Set the Stream head write offset. */
1616 	(void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1617 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1618 
1619 	mutex_enter(&connp->conn_lock);
1620 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1621 	mutex_exit(&connp->conn_lock);
1622 
1623 	icmp_bind_proto(connp->conn_icmp);
1624 
1625 	return (0);
1626 }
1627 
1628 /* For /dev/icmp aka AF_INET open */
1629 static int
1630 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1631 {
1632 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1633 }
1634 
1635 /* For /dev/icmp6 aka AF_INET6 open */
1636 static int
1637 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1638 {
1639 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1640 }
1641 
1642 /*
1643  * This is the open routine for icmp.  It allocates a icmp_t structure for
1644  * the stream and, on the first open of the module, creates an ND table.
1645  */
1646 static conn_t *
1647 rawip_do_open(int family, cred_t *credp, int *err, int flags)
1648 {
1649 	icmp_t	*icmp;
1650 	conn_t *connp;
1651 	zoneid_t zoneid;
1652 	netstack_t *ns;
1653 	icmp_stack_t *is;
1654 	int len;
1655 	boolean_t isv6 = B_FALSE;
1656 
1657 	*err = secpolicy_net_icmpaccess(credp);
1658 	if (*err != 0)
1659 		return (NULL);
1660 
1661 	if (family == AF_INET6)
1662 		isv6 = B_TRUE;
1663 
1664 	ns = netstack_find_by_cred(credp);
1665 	ASSERT(ns != NULL);
1666 	is = ns->netstack_icmp;
1667 	ASSERT(is != NULL);
1668 
1669 	/*
1670 	 * For exclusive stacks we set the zoneid to zero
1671 	 * to make ICMP operate as if in the global zone.
1672 	 */
1673 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1674 		zoneid = GLOBAL_ZONEID;
1675 	else
1676 		zoneid = crgetzoneid(credp);
1677 
1678 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1679 
1680 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1681 	icmp = connp->conn_icmp;
1682 
1683 	/*
1684 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1685 	 * done by netstack_find_by_cred()
1686 	 */
1687 	netstack_rele(ns);
1688 
1689 	/*
1690 	 * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1691 	 * need to lock anything.
1692 	 */
1693 	ASSERT(connp->conn_proto == IPPROTO_ICMP);
1694 	ASSERT(connp->conn_icmp == icmp);
1695 	ASSERT(icmp->icmp_connp == connp);
1696 
1697 	/* Set the initial state of the stream and the privilege status. */
1698 	icmp->icmp_state = TS_UNBND;
1699 	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1700 	if (isv6) {
1701 		connp->conn_family = AF_INET6;
1702 		connp->conn_ipversion = IPV6_VERSION;
1703 		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1704 		connp->conn_proto = IPPROTO_ICMPV6;
1705 		/* May be changed by a SO_PROTOTYPE socket option. */
1706 		connp->conn_proto = IPPROTO_ICMPV6;
1707 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1708 		connp->conn_ixa->ixa_raw_cksum_offset = 2;
1709 		connp->conn_default_ttl = is->is_ipv6_hoplimit;
1710 		len = sizeof (ip6_t);
1711 	} else {
1712 		connp->conn_family = AF_INET;
1713 		connp->conn_ipversion = IPV4_VERSION;
1714 		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1715 		/* May be changed by a SO_PROTOTYPE socket option. */
1716 		connp->conn_proto = IPPROTO_ICMP;
1717 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1718 		connp->conn_default_ttl = is->is_ipv4_ttl;
1719 		len = sizeof (ipha_t);
1720 	}
1721 	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1722 
1723 	connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1724 
1725 	/*
1726 	 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1727 	 * the checksum is provided in the pre-built packet. We clear
1728 	 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1729 	 * complete IP header and not to compute the transport checksum.
1730 	 */
1731 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1732 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1733 	connp->conn_ixa->ixa_zoneid = zoneid;
1734 
1735 	connp->conn_zoneid = zoneid;
1736 
1737 	/*
1738 	 * If the caller has the process-wide flag set, then default to MAC
1739 	 * exempt mode.  This allows read-down to unlabeled hosts.
1740 	 */
1741 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1742 		connp->conn_mac_mode = CONN_MAC_AWARE;
1743 
1744 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1745 
1746 	icmp->icmp_is = is;
1747 
1748 	connp->conn_rcvbuf = is->is_recv_hiwat;
1749 	connp->conn_sndbuf = is->is_xmit_hiwat;
1750 	connp->conn_sndlowat = is->is_xmit_lowat;
1751 	connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1752 
1753 	connp->conn_wroff = len + is->is_wroff_extra;
1754 	connp->conn_so_type = SOCK_RAW;
1755 
1756 	connp->conn_recv = icmp_input;
1757 	connp->conn_recvicmp = icmp_icmp_input;
1758 	crhold(credp);
1759 	connp->conn_cred = credp;
1760 	connp->conn_cpid = curproc->p_pid;
1761 	connp->conn_open_time = ddi_get_lbolt64();
1762 	/* Cache things in ixa without an extra refhold */
1763 	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1764 	connp->conn_ixa->ixa_cred = connp->conn_cred;
1765 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1766 	if (is_system_labeled())
1767 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1768 
1769 	connp->conn_flow_cntrld = B_FALSE;
1770 
1771 	if (is->is_pmtu_discovery)
1772 		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1773 
1774 	return (connp);
1775 }
1776 
1777 /*
1778  * Which ICMP options OK to set through T_UNITDATA_REQ...
1779  */
1780 /* ARGSUSED */
1781 static boolean_t
1782 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1783 {
1784 	return (B_TRUE);
1785 }
1786 
1787 /*
1788  * This routine gets default values of certain options whose default
1789  * values are maintained by protcol specific code
1790  */
1791 int
1792 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1793 {
1794 	icmp_t *icmp = Q_TO_ICMP(q);
1795 	icmp_stack_t *is = icmp->icmp_is;
1796 	int *i1 = (int *)ptr;
1797 
1798 	switch (level) {
1799 	case IPPROTO_IP:
1800 		switch (name) {
1801 		case IP_MULTICAST_TTL:
1802 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1803 			return (sizeof (uchar_t));
1804 		case IP_MULTICAST_LOOP:
1805 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1806 			return (sizeof (uchar_t));
1807 		}
1808 		break;
1809 	case IPPROTO_IPV6:
1810 		switch (name) {
1811 		case IPV6_MULTICAST_HOPS:
1812 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1813 			return (sizeof (int));
1814 		case IPV6_MULTICAST_LOOP:
1815 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1816 			return (sizeof (int));
1817 		case IPV6_UNICAST_HOPS:
1818 			*i1 = is->is_ipv6_hoplimit;
1819 			return (sizeof (int));
1820 		}
1821 		break;
1822 	case IPPROTO_ICMPV6:
1823 		switch (name) {
1824 		case ICMP6_FILTER:
1825 			/* Make it look like "pass all" */
1826 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1827 			return (sizeof (icmp6_filter_t));
1828 		}
1829 		break;
1830 	}
1831 	return (-1);
1832 }
1833 
1834 /*
1835  * This routine retrieves the current status of socket options.
1836  * It returns the size of the option retrieved, or -1.
1837  */
1838 int
1839 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1840 {
1841 	icmp_t		*icmp = connp->conn_icmp;
1842 	int		*i1 = (int *)ptr;
1843 	conn_opt_arg_t	coas;
1844 	int		retval;
1845 
1846 	coas.coa_connp = connp;
1847 	coas.coa_ixa = connp->conn_ixa;
1848 	coas.coa_ipp = &connp->conn_xmit_ipp;
1849 	coas.coa_ancillary = B_FALSE;
1850 	coas.coa_changed = 0;
1851 
1852 	/*
1853 	 * We assume that the optcom framework has checked for the set
1854 	 * of levels and names that are supported, hence we don't worry
1855 	 * about rejecting based on that.
1856 	 * First check for ICMP specific handling, then pass to common routine.
1857 	 */
1858 	switch (level) {
1859 	case IPPROTO_IP:
1860 		/*
1861 		 * Only allow IPv4 option processing on IPv4 sockets.
1862 		 */
1863 		if (connp->conn_family != AF_INET)
1864 			return (-1);
1865 
1866 		switch (name) {
1867 		case IP_OPTIONS:
1868 		case T_IP_OPTIONS:
1869 			/* Options are passed up with each packet */
1870 			return (0);
1871 		case IP_HDRINCL:
1872 			mutex_enter(&connp->conn_lock);
1873 			*i1 = (int)icmp->icmp_hdrincl;
1874 			mutex_exit(&connp->conn_lock);
1875 			return (sizeof (int));
1876 		}
1877 		break;
1878 
1879 	case IPPROTO_IPV6:
1880 		/*
1881 		 * Only allow IPv6 option processing on native IPv6 sockets.
1882 		 */
1883 		if (connp->conn_family != AF_INET6)
1884 			return (-1);
1885 
1886 		switch (name) {
1887 		case IPV6_CHECKSUM:
1888 			/*
1889 			 * Return offset or -1 if no checksum offset.
1890 			 * Does not apply to IPPROTO_ICMPV6
1891 			 */
1892 			if (connp->conn_proto == IPPROTO_ICMPV6)
1893 				return (-1);
1894 
1895 			mutex_enter(&connp->conn_lock);
1896 			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1897 				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1898 			else
1899 				*i1 = -1;
1900 			mutex_exit(&connp->conn_lock);
1901 			return (sizeof (int));
1902 		}
1903 		break;
1904 
1905 	case IPPROTO_ICMPV6:
1906 		/*
1907 		 * Only allow IPv6 option processing on native IPv6 sockets.
1908 		 */
1909 		if (connp->conn_family != AF_INET6)
1910 			return (-1);
1911 
1912 		if (connp->conn_proto != IPPROTO_ICMPV6)
1913 			return (-1);
1914 
1915 		switch (name) {
1916 		case ICMP6_FILTER:
1917 			mutex_enter(&connp->conn_lock);
1918 			if (icmp->icmp_filter == NULL) {
1919 				/* Make it look like "pass all" */
1920 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1921 			} else {
1922 				(void) bcopy(icmp->icmp_filter, ptr,
1923 				    sizeof (icmp6_filter_t));
1924 			}
1925 			mutex_exit(&connp->conn_lock);
1926 			return (sizeof (icmp6_filter_t));
1927 		}
1928 	}
1929 	mutex_enter(&connp->conn_lock);
1930 	retval = conn_opt_get(&coas, level, name, ptr);
1931 	mutex_exit(&connp->conn_lock);
1932 	return (retval);
1933 }
1934 
1935 /*
1936  * This routine retrieves the current status of socket options.
1937  * It returns the size of the option retrieved, or -1.
1938  */
1939 int
1940 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1941 {
1942 	conn_t		*connp = Q_TO_CONN(q);
1943 	int 		err;
1944 
1945 	err = icmp_opt_get(connp, level, name, ptr);
1946 	return (err);
1947 }
1948 
1949 /*
1950  * This routine sets socket options.
1951  */
1952 int
1953 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1954     uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1955 {
1956 	conn_t		*connp = coa->coa_connp;
1957 	ip_xmit_attr_t	*ixa = coa->coa_ixa;
1958 	icmp_t		*icmp = connp->conn_icmp;
1959 	icmp_stack_t	*is = icmp->icmp_is;
1960 	int		*i1 = (int *)invalp;
1961 	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
1962 	int		error;
1963 
1964 	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1965 
1966 	/*
1967 	 * For fixed length options, no sanity check
1968 	 * of passed in length is done. It is assumed *_optcom_req()
1969 	 * routines do the right thing.
1970 	 */
1971 
1972 	switch (level) {
1973 	case SOL_SOCKET:
1974 		switch (name) {
1975 		case SO_PROTOTYPE:
1976 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
1977 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
1978 			    secpolicy_net_rawaccess(cr) != 0) {
1979 				return (EACCES);
1980 			}
1981 			if (checkonly)
1982 				break;
1983 
1984 			mutex_enter(&connp->conn_lock);
1985 			connp->conn_proto = *i1 & 0xFF;
1986 			ixa->ixa_protocol = connp->conn_proto;
1987 			if ((connp->conn_proto == IPPROTO_RAW ||
1988 			    connp->conn_proto == IPPROTO_IGMP) &&
1989 			    connp->conn_family == AF_INET) {
1990 				icmp->icmp_hdrincl = 1;
1991 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1992 			} else if (connp->conn_proto == IPPROTO_UDP ||
1993 			    connp->conn_proto == IPPROTO_TCP ||
1994 			    connp->conn_proto == IPPROTO_SCTP) {
1995 				/* Used by test applications like psh */
1996 				icmp->icmp_hdrincl = 0;
1997 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1998 			} else {
1999 				icmp->icmp_hdrincl = 0;
2000 				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2001 			}
2002 
2003 			if (connp->conn_family == AF_INET6 &&
2004 			    connp->conn_proto == IPPROTO_ICMPV6) {
2005 				/* Set offset for icmp6_cksum */
2006 				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2007 				ixa->ixa_raw_cksum_offset = 2;
2008 			}
2009 			if (icmp->icmp_filter != NULL &&
2010 			    connp->conn_proto != IPPROTO_ICMPV6) {
2011 				kmem_free(icmp->icmp_filter,
2012 				    sizeof (icmp6_filter_t));
2013 				icmp->icmp_filter = NULL;
2014 			}
2015 			mutex_exit(&connp->conn_lock);
2016 
2017 			coa->coa_changed |= COA_HEADER_CHANGED;
2018 			/*
2019 			 * For SCTP, we don't use icmp_bind_proto() for
2020 			 * raw socket binding.
2021 			 */
2022 			if (connp->conn_proto == IPPROTO_SCTP)
2023 				return (0);
2024 
2025 			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
2026 			return (0);
2027 
2028 		case SO_SNDBUF:
2029 			if (*i1 > is->is_max_buf) {
2030 				return (ENOBUFS);
2031 			}
2032 			break;
2033 		case SO_RCVBUF:
2034 			if (*i1 > is->is_max_buf) {
2035 				return (ENOBUFS);
2036 			}
2037 			break;
2038 		}
2039 		break;
2040 
2041 	case IPPROTO_IP:
2042 		/*
2043 		 * Only allow IPv4 option processing on IPv4 sockets.
2044 		 */
2045 		if (connp->conn_family != AF_INET)
2046 			return (EINVAL);
2047 
2048 		switch (name) {
2049 		case IP_HDRINCL:
2050 			if (!checkonly) {
2051 				mutex_enter(&connp->conn_lock);
2052 				icmp->icmp_hdrincl = onoff;
2053 				if (onoff)
2054 					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2055 				else
2056 					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2057 				mutex_exit(&connp->conn_lock);
2058 			}
2059 			break;
2060 		}
2061 		break;
2062 
2063 	case IPPROTO_IPV6:
2064 		if (connp->conn_family != AF_INET6)
2065 			return (EINVAL);
2066 
2067 		switch (name) {
2068 		case IPV6_CHECKSUM:
2069 			/*
2070 			 * Integer offset into the user data of where the
2071 			 * checksum is located.
2072 			 * Offset of -1 disables option.
2073 			 * Does not apply to IPPROTO_ICMPV6.
2074 			 */
2075 			if (connp->conn_proto == IPPROTO_ICMPV6 ||
2076 			    coa->coa_ancillary) {
2077 				return (EINVAL);
2078 			}
2079 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2080 				/* Negative or not 16 bit aligned offset */
2081 				return (EINVAL);
2082 			}
2083 			if (checkonly)
2084 				break;
2085 
2086 			mutex_enter(&connp->conn_lock);
2087 			if (*i1 == -1) {
2088 				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2089 				ixa->ixa_raw_cksum_offset = 0;
2090 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2091 			} else {
2092 				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2093 				ixa->ixa_raw_cksum_offset = *i1;
2094 				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2095 			}
2096 			mutex_exit(&connp->conn_lock);
2097 			break;
2098 		}
2099 		break;
2100 
2101 	case IPPROTO_ICMPV6:
2102 		/*
2103 		 * Only allow IPv6 option processing on IPv6 sockets.
2104 		 */
2105 		if (connp->conn_family != AF_INET6)
2106 			return (EINVAL);
2107 		if (connp->conn_proto != IPPROTO_ICMPV6)
2108 			return (EINVAL);
2109 
2110 		switch (name) {
2111 		case ICMP6_FILTER:
2112 			if (checkonly)
2113 				break;
2114 
2115 			if ((inlen != 0) &&
2116 			    (inlen != sizeof (icmp6_filter_t)))
2117 				return (EINVAL);
2118 
2119 			mutex_enter(&connp->conn_lock);
2120 			if (inlen == 0) {
2121 				if (icmp->icmp_filter != NULL) {
2122 					kmem_free(icmp->icmp_filter,
2123 					    sizeof (icmp6_filter_t));
2124 					icmp->icmp_filter = NULL;
2125 				}
2126 			} else {
2127 				if (icmp->icmp_filter == NULL) {
2128 					icmp->icmp_filter = kmem_alloc(
2129 					    sizeof (icmp6_filter_t),
2130 					    KM_NOSLEEP);
2131 					if (icmp->icmp_filter == NULL) {
2132 						mutex_exit(&connp->conn_lock);
2133 						return (ENOBUFS);
2134 					}
2135 				}
2136 				(void) bcopy(invalp, icmp->icmp_filter, inlen);
2137 			}
2138 			mutex_exit(&connp->conn_lock);
2139 			break;
2140 		}
2141 		break;
2142 	}
2143 	error = conn_opt_set(coa, level, name, inlen, invalp,
2144 	    checkonly, cr);
2145 	return (error);
2146 }
2147 
2148 /*
2149  * This routine sets socket options.
2150  */
2151 int
2152 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2153     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2154     void *thisdg_attrs, cred_t *cr)
2155 {
2156 	icmp_t		*icmp = connp->conn_icmp;
2157 	int		err;
2158 	conn_opt_arg_t	coas, *coa;
2159 	boolean_t	checkonly;
2160 	icmp_stack_t	*is = icmp->icmp_is;
2161 
2162 	switch (optset_context) {
2163 	case SETFN_OPTCOM_CHECKONLY:
2164 		checkonly = B_TRUE;
2165 		/*
2166 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2167 		 * inlen != 0 implies value supplied and
2168 		 * 	we have to "pretend" to set it.
2169 		 * inlen == 0 implies that there is no
2170 		 * 	value part in T_CHECK request and just validation
2171 		 * done elsewhere should be enough, we just return here.
2172 		 */
2173 		if (inlen == 0) {
2174 			*outlenp = 0;
2175 			return (0);
2176 		}
2177 		break;
2178 	case SETFN_OPTCOM_NEGOTIATE:
2179 		checkonly = B_FALSE;
2180 		break;
2181 	case SETFN_UD_NEGOTIATE:
2182 	case SETFN_CONN_NEGOTIATE:
2183 		checkonly = B_FALSE;
2184 		/*
2185 		 * Negotiating local and "association-related" options
2186 		 * through T_UNITDATA_REQ.
2187 		 *
2188 		 * Following routine can filter out ones we do not
2189 		 * want to be "set" this way.
2190 		 */
2191 		if (!icmp_opt_allow_udr_set(level, name)) {
2192 			*outlenp = 0;
2193 			return (EINVAL);
2194 		}
2195 		break;
2196 	default:
2197 		/*
2198 		 * We should never get here
2199 		 */
2200 		*outlenp = 0;
2201 		return (EINVAL);
2202 	}
2203 
2204 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2205 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2206 
2207 	if (thisdg_attrs != NULL) {
2208 		/* Options from T_UNITDATA_REQ */
2209 		coa = (conn_opt_arg_t *)thisdg_attrs;
2210 		ASSERT(coa->coa_connp == connp);
2211 		ASSERT(coa->coa_ixa != NULL);
2212 		ASSERT(coa->coa_ipp != NULL);
2213 		ASSERT(coa->coa_ancillary);
2214 	} else {
2215 		coa = &coas;
2216 		coas.coa_connp = connp;
2217 		/* Get a reference on conn_ixa to prevent concurrent mods */
2218 		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2219 		if (coas.coa_ixa == NULL) {
2220 			*outlenp = 0;
2221 			return (ENOMEM);
2222 		}
2223 		coas.coa_ipp = &connp->conn_xmit_ipp;
2224 		coas.coa_ancillary = B_FALSE;
2225 		coas.coa_changed = 0;
2226 	}
2227 
2228 	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2229 	    cr, checkonly);
2230 	if (err != 0) {
2231 errout:
2232 		if (!coa->coa_ancillary)
2233 			ixa_refrele(coa->coa_ixa);
2234 		*outlenp = 0;
2235 		return (err);
2236 	}
2237 
2238 	/*
2239 	 * Common case of OK return with outval same as inval.
2240 	 */
2241 	if (invalp != outvalp) {
2242 		/* don't trust bcopy for identical src/dst */
2243 		(void) bcopy(invalp, outvalp, inlen);
2244 	}
2245 	*outlenp = inlen;
2246 
2247 	/*
2248 	 * If this was not ancillary data, then we rebuild the headers,
2249 	 * update the IRE/NCE, and IPsec as needed.
2250 	 * Since the label depends on the destination we go through
2251 	 * ip_set_destination first.
2252 	 */
2253 	if (coa->coa_ancillary) {
2254 		return (0);
2255 	}
2256 
2257 	if (coa->coa_changed & COA_ROUTE_CHANGED) {
2258 		in6_addr_t saddr, faddr, nexthop;
2259 		in_port_t fport;
2260 
2261 		/*
2262 		 * We clear lastdst to make sure we pick up the change
2263 		 * next time sending.
2264 		 * If we are connected we re-cache the information.
2265 		 * We ignore errors to preserve BSD behavior.
2266 		 * Note that we don't redo IPsec policy lookup here
2267 		 * since the final destination (or source) didn't change.
2268 		 */
2269 		mutex_enter(&connp->conn_lock);
2270 		connp->conn_v6lastdst = ipv6_all_zeros;
2271 
2272 		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2273 		    &connp->conn_faddr_v6, &nexthop);
2274 		saddr = connp->conn_saddr_v6;
2275 		faddr = connp->conn_faddr_v6;
2276 		fport = connp->conn_fport;
2277 		mutex_exit(&connp->conn_lock);
2278 
2279 		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2280 		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2281 			(void) ip_attr_connect(connp, coa->coa_ixa,
2282 			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
2283 			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2284 		}
2285 	}
2286 
2287 	ixa_refrele(coa->coa_ixa);
2288 
2289 	if (coa->coa_changed & COA_HEADER_CHANGED) {
2290 		/*
2291 		 * Rebuild the header template if we are connected.
2292 		 * Otherwise clear conn_v6lastdst so we rebuild the header
2293 		 * in the data path.
2294 		 */
2295 		mutex_enter(&connp->conn_lock);
2296 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2297 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2298 			err = icmp_build_hdr_template(connp,
2299 			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2300 			    connp->conn_flowinfo);
2301 			if (err != 0) {
2302 				mutex_exit(&connp->conn_lock);
2303 				return (err);
2304 			}
2305 		} else {
2306 			connp->conn_v6lastdst = ipv6_all_zeros;
2307 		}
2308 		mutex_exit(&connp->conn_lock);
2309 	}
2310 	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2311 		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2312 		    connp->conn_rcvbuf);
2313 	}
2314 	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2315 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2316 	}
2317 	if (coa->coa_changed & COA_WROFF_CHANGED) {
2318 		/* Increase wroff if needed */
2319 		uint_t wroff;
2320 
2321 		mutex_enter(&connp->conn_lock);
2322 		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2323 		if (wroff > connp->conn_wroff) {
2324 			connp->conn_wroff = wroff;
2325 			mutex_exit(&connp->conn_lock);
2326 			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2327 		} else {
2328 			mutex_exit(&connp->conn_lock);
2329 		}
2330 	}
2331 	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2332 		icmp_bind_proto(icmp);
2333 	}
2334 	return (err);
2335 }
2336 
2337 /* This routine sets socket options. */
2338 int
2339 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2340     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2341     void *thisdg_attrs, cred_t *cr)
2342 {
2343 	conn_t	*connp = Q_TO_CONN(q);
2344 	int error;
2345 
2346 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2347 	    outlenp, outvalp, thisdg_attrs, cr);
2348 	return (error);
2349 }
2350 
2351 /*
2352  * Setup IP headers.
2353  *
2354  * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2355  * but icmp_output_hdrincl restores ipha_protocol once we return.
2356  */
2357 mblk_t *
2358 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2359     const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2360     mblk_t *data_mp, int *errorp)
2361 {
2362 	mblk_t		*mp;
2363 	icmp_stack_t	*is = connp->conn_netstack->netstack_icmp;
2364 	uint_t		data_len;
2365 	uint32_t	cksum;
2366 
2367 	data_len = msgdsize(data_mp);
2368 	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2369 	    flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2370 	if (mp == NULL) {
2371 		ASSERT(*errorp != 0);
2372 		return (NULL);
2373 	}
2374 
2375 	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2376 
2377 	/*
2378 	 * If there was a routing option/header then conn_prepend_hdr
2379 	 * has massaged it and placed the pseudo-header checksum difference
2380 	 * in the cksum argument.
2381 	 *
2382 	 * Prepare for ICMPv6 checksum done in IP.
2383 	 *
2384 	 * We make it easy for IP to include our pseudo header
2385 	 * by putting our length (and any routing header adjustment)
2386 	 * in the ICMPv6 checksum field.
2387 	 * The IP source, destination, and length have already been set by
2388 	 * conn_prepend_hdr.
2389 	 */
2390 	cksum += data_len;
2391 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
2392 	ASSERT(cksum < 0x10000);
2393 
2394 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2395 		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
2396 
2397 		ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2398 	} else {
2399 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2400 		uint_t	cksum_offset = 0;
2401 
2402 		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2403 
2404 		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2405 			if (connp->conn_proto == IPPROTO_ICMPV6) {
2406 				cksum_offset = ixa->ixa_ip_hdr_length +
2407 				    offsetof(icmp6_t, icmp6_cksum);
2408 			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2409 				cksum_offset = ixa->ixa_ip_hdr_length +
2410 				    ixa->ixa_raw_cksum_offset;
2411 			}
2412 		}
2413 		if (cksum_offset != 0) {
2414 			uint16_t *ptr;
2415 
2416 			/* Make sure the checksum fits in the first mblk */
2417 			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2418 				mblk_t *mp1;
2419 
2420 				mp1 = msgpullup(mp,
2421 				    cksum_offset + sizeof (short));
2422 				freemsg(mp);
2423 				if (mp1 == NULL) {
2424 					*errorp = ENOMEM;
2425 					return (NULL);
2426 				}
2427 				mp = mp1;
2428 				ip6h = (ip6_t *)mp->b_rptr;
2429 			}
2430 			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2431 			*ptr = htons(cksum);
2432 		}
2433 	}
2434 
2435 	/* Note that we don't try to update wroff due to ancillary data */
2436 	return (mp);
2437 }
2438 
2439 static int
2440 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2441     const in6_addr_t *v6dst, uint32_t flowinfo)
2442 {
2443 	int		error;
2444 
2445 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2446 	/*
2447 	 * We clear lastdst to make sure we don't use the lastdst path
2448 	 * next time sending since we might not have set v6dst yet.
2449 	 */
2450 	connp->conn_v6lastdst = ipv6_all_zeros;
2451 
2452 	error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2453 	if (error != 0)
2454 		return (error);
2455 
2456 	/*
2457 	 * Any routing header/option has been massaged. The checksum difference
2458 	 * is stored in conn_sum.
2459 	 */
2460 	return (0);
2461 }
2462 
2463 static mblk_t *
2464 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2465 {
2466 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2467 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2468 		/*
2469 		 * fallback has started but messages have not been moved yet
2470 		 */
2471 		if (icmp->icmp_fallback_queue_head == NULL) {
2472 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2473 			icmp->icmp_fallback_queue_head = mp;
2474 			icmp->icmp_fallback_queue_tail = mp;
2475 		} else {
2476 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2477 			icmp->icmp_fallback_queue_tail->b_next = mp;
2478 			icmp->icmp_fallback_queue_tail = mp;
2479 		}
2480 		return (NULL);
2481 	} else {
2482 		/*
2483 		 * Fallback completed, let the caller putnext() the mblk.
2484 		 */
2485 		return (mp);
2486 	}
2487 }
2488 
2489 /*
2490  * Deliver data to ULP. In case we have a socket, and it's falling back to
2491  * TPI, then we'll queue the mp for later processing.
2492  */
2493 static void
2494 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2495 {
2496 	if (IPCL_IS_NONSTR(connp)) {
2497 		icmp_t *icmp = connp->conn_icmp;
2498 		int error;
2499 
2500 		ASSERT(len == msgdsize(mp));
2501 		if ((*connp->conn_upcalls->su_recv)
2502 		    (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2503 			mutex_enter(&icmp->icmp_recv_lock);
2504 			if (error == ENOSPC) {
2505 				/*
2506 				 * let's confirm while holding the lock
2507 				 */
2508 				if ((*connp->conn_upcalls->su_recv)
2509 				    (connp->conn_upper_handle, NULL, 0, 0,
2510 				    &error, NULL) < 0) {
2511 					ASSERT(error == ENOSPC);
2512 					if (error == ENOSPC) {
2513 						connp->conn_flow_cntrld =
2514 						    B_TRUE;
2515 					}
2516 				}
2517 				mutex_exit(&icmp->icmp_recv_lock);
2518 			} else {
2519 				ASSERT(error == EOPNOTSUPP);
2520 				mp = icmp_queue_fallback(icmp, mp);
2521 				mutex_exit(&icmp->icmp_recv_lock);
2522 				if (mp != NULL)
2523 					putnext(connp->conn_rq, mp);
2524 			}
2525 		}
2526 		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2527 	} else {
2528 		putnext(connp->conn_rq, mp);
2529 	}
2530 }
2531 
2532 /*
2533  * This is the inbound data path.
2534  * IP has already pulled up the IP headers and verified alignment
2535  * etc.
2536  */
2537 /* ARGSUSED2 */
2538 static void
2539 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2540 {
2541 	conn_t			*connp = (conn_t *)arg1;
2542 	struct T_unitdata_ind	*tudi;
2543 	uchar_t			*rptr;		/* Pointer to IP header */
2544 	int			ip_hdr_length;
2545 	int			udi_size;	/* Size of T_unitdata_ind */
2546 	int			pkt_len;
2547 	icmp_t			*icmp;
2548 	ip_pkt_t		ipps;
2549 	ip6_t			*ip6h;
2550 	mblk_t			*mp1;
2551 	crb_t			recv_ancillary;
2552 	icmp_stack_t		*is;
2553 	sin_t			*sin;
2554 	sin6_t			*sin6;
2555 	ipha_t			*ipha;
2556 
2557 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2558 
2559 	icmp = connp->conn_icmp;
2560 	is = icmp->icmp_is;
2561 	rptr = mp->b_rptr;
2562 
2563 	ASSERT(DB_TYPE(mp) == M_DATA);
2564 	ASSERT(OK_32PTR(rptr));
2565 	ASSERT(ira->ira_pktlen == msgdsize(mp));
2566 	pkt_len = ira->ira_pktlen;
2567 
2568 	/*
2569 	 * Get a snapshot of these and allow other threads to change
2570 	 * them after that. We need the same recv_ancillary when determining
2571 	 * the size as when adding the ancillary data items.
2572 	 */
2573 	mutex_enter(&connp->conn_lock);
2574 	recv_ancillary = connp->conn_recv_ancillary;
2575 	mutex_exit(&connp->conn_lock);
2576 
2577 	ip_hdr_length = ira->ira_ip_hdr_length;
2578 	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */
2579 
2580 	/* Initialize regardless of IP version */
2581 	ipps.ipp_fields = 0;
2582 
2583 	if (ira->ira_flags & IRAF_IS_IPV4) {
2584 		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
2585 		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2586 		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
2587 
2588 		ipha = (ipha_t *)mp->b_rptr;
2589 		if (recv_ancillary.crb_all != 0)
2590 			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
2591 
2592 		/*
2593 		 * BSD for some reason adjusts ipha_length to exclude the
2594 		 * IP header length. We do the same.
2595 		 */
2596 		if (is->is_bsd_compat) {
2597 			ushort_t len;
2598 
2599 			len = ntohs(ipha->ipha_length);
2600 			if (mp->b_datap->db_ref > 1) {
2601 				/*
2602 				 * Allocate a new IP header so that we can
2603 				 * modify ipha_length.
2604 				 */
2605 				mblk_t	*mp1;
2606 
2607 				mp1 = allocb(ip_hdr_length, BPRI_MED);
2608 				if (mp1 == NULL) {
2609 					freemsg(mp);
2610 					BUMP_MIB(&is->is_rawip_mib,
2611 					    rawipInErrors);
2612 					return;
2613 				}
2614 				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
2615 				mp->b_rptr = rptr + ip_hdr_length;
2616 				rptr = mp1->b_rptr;
2617 				ipha = (ipha_t *)rptr;
2618 				mp1->b_cont = mp;
2619 				mp1->b_wptr = rptr + ip_hdr_length;
2620 				mp = mp1;
2621 			}
2622 			len -= ip_hdr_length;
2623 			ipha->ipha_length = htons(len);
2624 		}
2625 
2626 		/*
2627 		 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6
2628 		 * sockets. This is ensured by icmp_bind and the IP fanout code.
2629 		 */
2630 		ASSERT(connp->conn_family == AF_INET);
2631 
2632 		/*
2633 		 * This is the inbound data path.  Packets are passed upstream
2634 		 * as T_UNITDATA_IND messages with full IPv4 headers still
2635 		 * attached.
2636 		 */
2637 
2638 		/*
2639 		 * Normally only send up the source address.
2640 		 * If any ancillary data items are wanted we add those.
2641 		 */
2642 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
2643 		if (recv_ancillary.crb_all != 0) {
2644 			udi_size += conn_recvancillary_size(connp,
2645 			    recv_ancillary, ira, mp, &ipps);
2646 		}
2647 
2648 		/* Allocate a message block for the T_UNITDATA_IND structure. */
2649 		mp1 = allocb(udi_size, BPRI_MED);
2650 		if (mp1 == NULL) {
2651 			freemsg(mp);
2652 			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2653 			return;
2654 		}
2655 		mp1->b_cont = mp;
2656 		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2657 		mp1->b_datap->db_type = M_PROTO;
2658 		mp1->b_wptr = (uchar_t *)tudi + udi_size;
2659 		tudi->PRIM_type = T_UNITDATA_IND;
2660 		tudi->SRC_length = sizeof (sin_t);
2661 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2662 		sin = (sin_t *)&tudi[1];
2663 		*sin = sin_null;
2664 		sin->sin_family = AF_INET;
2665 		sin->sin_addr.s_addr = ipha->ipha_src;
2666 		*(uint32_t *)&sin->sin_zero[0] = 0;
2667 		*(uint32_t *)&sin->sin_zero[4] = 0;
2668 		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
2669 		    sizeof (sin_t);
2670 		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
2671 		tudi->OPT_length = udi_size;
2672 
2673 		/*
2674 		 * Add options if IP_RECVIF etc is set
2675 		 */
2676 		if (udi_size != 0) {
2677 			conn_recvancillary_add(connp, recv_ancillary, ira,
2678 			    &ipps, (uchar_t *)&sin[1], udi_size);
2679 		}
2680 		goto deliver;
2681 	}
2682 
2683 	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
2684 	/*
2685 	 * IPv6 packets can only be received by applications
2686 	 * that are prepared to receive IPv6 addresses.
2687 	 * The IP fanout must ensure this.
2688 	 */
2689 	ASSERT(connp->conn_family == AF_INET6);
2690 
2691 	/*
2692 	 * Handle IPv6 packets. We don't pass up the IP headers with the
2693 	 * payload for IPv6.
2694 	 */
2695 
2696 	ip6h = (ip6_t *)rptr;
2697 	if (recv_ancillary.crb_all != 0) {
2698 		/*
2699 		 * Call on ip_find_hdr_v6 which gets individual lenghts of
2700 		 * extension headers (and pointers to them).
2701 		 */
2702 		uint8_t		nexthdr;
2703 
2704 		/* We don't care about the length or nextheader. */
2705 		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
2706 
2707 		/*
2708 		 * We do not pass up hop-by-hop options or any other
2709 		 * extension header as part of the packet. Applications
2710 		 * that want to see them have to specify IPV6_RECV* socket
2711 		 * options. And conn_recvancillary_size/add explicitly
2712 		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
2713 		 *
2714 		 * If we had multilevel ICMP sockets, then we'd want to
2715 		 * modify conn_recvancillary_size/add to
2716 		 * allow the user to see the label.
2717 		 */
2718 	}
2719 
2720 	/*
2721 	 * Check a filter for ICMPv6 types if needed.
2722 	 * Verify raw checksums if needed.
2723 	 */
2724 	mutex_enter(&connp->conn_lock);
2725 	if (icmp->icmp_filter != NULL) {
2726 		int type;
2727 
2728 		/* Assumes that IP has done the pullupmsg */
2729 		type = mp->b_rptr[ip_hdr_length];
2730 
2731 		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
2732 		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
2733 			mutex_exit(&connp->conn_lock);
2734 			freemsg(mp);
2735 			return;
2736 		}
2737 	}
2738 	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2739 		/* Checksum */
2740 		uint16_t	*up;
2741 		uint32_t	sum;
2742 		int		remlen;
2743 
2744 		up = (uint16_t *)&ip6h->ip6_src;
2745 
2746 		remlen = msgdsize(mp) - ip_hdr_length;
2747 		sum = htons(connp->conn_proto + remlen)
2748 		    + up[0] + up[1] + up[2] + up[3]
2749 		    + up[4] + up[5] + up[6] + up[7]
2750 		    + up[8] + up[9] + up[10] + up[11]
2751 		    + up[12] + up[13] + up[14] + up[15];
2752 		sum = (sum & 0xffff) + (sum >> 16);
2753 		sum = IP_CSUM(mp, ip_hdr_length, sum);
2754 		if (sum != 0) {
2755 			/* IPv6 RAW checksum failed */
2756 			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
2757 			mutex_exit(&connp->conn_lock);
2758 			freemsg(mp);
2759 			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
2760 			return;
2761 		}
2762 	}
2763 	mutex_exit(&connp->conn_lock);
2764 
2765 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2766 
2767 	if (recv_ancillary.crb_all != 0) {
2768 		udi_size += conn_recvancillary_size(connp,
2769 		    recv_ancillary, ira, mp, &ipps);
2770 	}
2771 
2772 	mp1 = allocb(udi_size, BPRI_MED);
2773 	if (mp1 == NULL) {
2774 		freemsg(mp);
2775 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
2776 		return;
2777 	}
2778 	mp1->b_cont = mp;
2779 	mp1->b_datap->db_type = M_PROTO;
2780 	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
2781 	mp1->b_wptr = (uchar_t *)tudi + udi_size;
2782 	tudi->PRIM_type = T_UNITDATA_IND;
2783 	tudi->SRC_length = sizeof (sin6_t);
2784 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
2785 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
2786 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
2787 	tudi->OPT_length = udi_size;
2788 	sin6 = (sin6_t *)&tudi[1];
2789 	*sin6 = sin6_null;
2790 	sin6->sin6_port = 0;
2791 	sin6->sin6_family = AF_INET6;
2792 
2793 	sin6->sin6_addr = ip6h->ip6_src;
2794 	/* No sin6_flowinfo per API */
2795 	sin6->sin6_flowinfo = 0;
2796 	/* For link-scope pass up scope id */
2797 	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
2798 		sin6->sin6_scope_id = ira->ira_ruifindex;
2799 	else
2800 		sin6->sin6_scope_id = 0;
2801 	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
2802 	    IPCL_ZONEID(connp), is->is_netstack);
2803 
2804 	if (udi_size != 0) {
2805 		conn_recvancillary_add(connp, recv_ancillary, ira,
2806 		    &ipps, (uchar_t *)&sin6[1], udi_size);
2807 	}
2808 
2809 	/* Skip all the IPv6 headers per API */
2810 	mp->b_rptr += ip_hdr_length;
2811 	pkt_len -= ip_hdr_length;
2812 
2813 deliver:
2814 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
2815 	icmp_ulp_recv(connp, mp1, pkt_len);
2816 }
2817 
2818 /*
2819  * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
2820  * information that can be changing beneath us.
2821  */
2822 mblk_t *
2823 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2824 {
2825 	mblk_t			*mpdata;
2826 	struct opthdr		*optp;
2827 	conn_t			*connp = Q_TO_CONN(q);
2828 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
2829 	mblk_t			*mp2ctl;
2830 
2831 	/*
2832 	 * make a copy of the original message
2833 	 */
2834 	mp2ctl = copymsg(mpctl);
2835 
2836 	if (mpctl == NULL ||
2837 	    (mpdata = mpctl->b_cont) == NULL) {
2838 		freemsg(mpctl);
2839 		freemsg(mp2ctl);
2840 		return (0);
2841 	}
2842 
2843 	/* fixed length structure for IPv4 and IPv6 counters */
2844 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2845 	optp->level = EXPER_RAWIP;
2846 	optp->name = 0;
2847 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2848 	    sizeof (is->is_rawip_mib));
2849 	optp->len = msgdsize(mpdata);
2850 	qreply(q, mpctl);
2851 
2852 	return (mp2ctl);
2853 }
2854 
2855 /*
2856  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2857  * TODO:  If this ever actually tries to set anything, it needs to be
2858  * to do the appropriate locking.
2859  */
2860 /* ARGSUSED */
2861 int
2862 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2863     uchar_t *ptr, int len)
2864 {
2865 	switch (level) {
2866 	case EXPER_RAWIP:
2867 		return (0);
2868 	default:
2869 		return (1);
2870 	}
2871 }
2872 
2873 /*
2874  * This routine creates a T_UDERROR_IND message and passes it upstream.
2875  * The address and options are copied from the T_UNITDATA_REQ message
2876  * passed in mp.  This message is freed.
2877  */
2878 static void
2879 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2880 {
2881 	struct T_unitdata_req *tudr;
2882 	mblk_t	*mp1;
2883 	uchar_t *destaddr;
2884 	t_scalar_t destlen;
2885 	uchar_t	*optaddr;
2886 	t_scalar_t optlen;
2887 
2888 	if ((mp->b_wptr < mp->b_rptr) ||
2889 	    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2890 		goto done;
2891 	}
2892 	tudr = (struct T_unitdata_req *)mp->b_rptr;
2893 	destaddr = mp->b_rptr + tudr->DEST_offset;
2894 	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2895 	    destaddr + tudr->DEST_length < mp->b_rptr ||
2896 	    destaddr + tudr->DEST_length > mp->b_wptr) {
2897 		goto done;
2898 	}
2899 	optaddr = mp->b_rptr + tudr->OPT_offset;
2900 	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2901 	    optaddr + tudr->OPT_length < mp->b_rptr ||
2902 	    optaddr + tudr->OPT_length > mp->b_wptr) {
2903 		goto done;
2904 	}
2905 	destlen = tudr->DEST_length;
2906 	optlen = tudr->OPT_length;
2907 
2908 	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2909 	    (char *)optaddr, optlen, err);
2910 	if (mp1 != NULL)
2911 		qreply(q, mp1);
2912 
2913 done:
2914 	freemsg(mp);
2915 }
2916 
2917 static int
2918 rawip_do_unbind(conn_t *connp)
2919 {
2920 	icmp_t	*icmp = connp->conn_icmp;
2921 
2922 	mutex_enter(&connp->conn_lock);
2923 	/* If a bind has not been done, we can't unbind. */
2924 	if (icmp->icmp_state == TS_UNBND) {
2925 		mutex_exit(&connp->conn_lock);
2926 		return (-TOUTSTATE);
2927 	}
2928 	connp->conn_saddr_v6 = ipv6_all_zeros;
2929 	connp->conn_bound_addr_v6 = ipv6_all_zeros;
2930 	connp->conn_laddr_v6 = ipv6_all_zeros;
2931 	connp->conn_mcbc_bind = B_FALSE;
2932 	connp->conn_lport = 0;
2933 	connp->conn_fport = 0;
2934 	/* In case we were also connected */
2935 	connp->conn_faddr_v6 = ipv6_all_zeros;
2936 	connp->conn_v6lastdst = ipv6_all_zeros;
2937 
2938 	icmp->icmp_state = TS_UNBND;
2939 
2940 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
2941 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
2942 	mutex_exit(&connp->conn_lock);
2943 
2944 	ip_unbind(connp);
2945 	return (0);
2946 }
2947 
2948 /*
2949  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
2950  * After some error checking, the message is passed downstream to ip.
2951  */
2952 static void
2953 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
2954 {
2955 	conn_t	*connp = Q_TO_CONN(q);
2956 	int	error;
2957 
2958 	ASSERT(mp->b_cont == NULL);
2959 	error = rawip_do_unbind(connp);
2960 	if (error) {
2961 		if (error < 0) {
2962 			icmp_err_ack(q, mp, -error, 0);
2963 		} else {
2964 			icmp_err_ack(q, mp, 0, error);
2965 		}
2966 		return;
2967 	}
2968 
2969 	/*
2970 	 * Convert mp into a T_OK_ACK
2971 	 */
2972 
2973 	mp = mi_tpi_ok_ack_alloc(mp);
2974 
2975 	/*
2976 	 * should not happen in practice... T_OK_ACK is smaller than the
2977 	 * original message.
2978 	 */
2979 	ASSERT(mp != NULL);
2980 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
2981 	qreply(q, mp);
2982 }
2983 
2984 /*
2985  * Process IPv4 packets that already include an IP header.
2986  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
2987  * IPPROTO_IGMP).
2988  * In this case we ignore the address and any options in the T_UNITDATA_REQ.
2989  *
2990  * The packet is assumed to have a base (20 byte) IP header followed
2991  * by the upper-layer protocol. We include any IP_OPTIONS including a
2992  * CIPSO label but otherwise preserve the base IP header.
2993  */
2994 static int
2995 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
2996 {
2997 	icmp_t		*icmp = connp->conn_icmp;
2998 	icmp_stack_t	*is = icmp->icmp_is;
2999 	ipha_t		iphas;
3000 	ipha_t		*ipha;
3001 	int		ip_hdr_length;
3002 	int		tp_hdr_len;
3003 	ip_xmit_attr_t	*ixa;
3004 	ip_pkt_t	*ipp;
3005 	in6_addr_t	v6src;
3006 	in6_addr_t	v6dst;
3007 	in6_addr_t	v6nexthop;
3008 	int		error;
3009 	boolean_t	do_ipsec;
3010 
3011 	/*
3012 	 * We need an exclusive copy of conn_ixa since the included IP
3013 	 * header could have any destination.
3014 	 * That copy has no pointers hence we
3015 	 * need to set them up once we've parsed the ancillary data.
3016 	 */
3017 	ixa = conn_get_ixa_exclusive(connp);
3018 	if (ixa == NULL) {
3019 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3020 		freemsg(mp);
3021 		return (ENOMEM);
3022 	}
3023 	ASSERT(cr != NULL);
3024 	/*
3025 	 * Caller has a reference on cr; from db_credp or because we
3026 	 * are running in process context.
3027 	 */
3028 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3029 	ixa->ixa_cred = cr;
3030 	ixa->ixa_cpid = pid;
3031 	if (is_system_labeled()) {
3032 		/* We need to restart with a label based on the cred */
3033 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3034 	}
3035 
3036 	/* In case previous destination was multicast or multirt */
3037 	ip_attr_newdst(ixa);
3038 
3039 	/* Get a copy of conn_xmit_ipp since the TX label might change it */
3040 	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3041 	if (ipp == NULL) {
3042 		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3043 		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3044 		ixa->ixa_cpid = connp->conn_cpid;
3045 		ixa_refrele(ixa);
3046 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3047 		freemsg(mp);
3048 		return (ENOMEM);
3049 	}
3050 	mutex_enter(&connp->conn_lock);
3051 	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3052 	mutex_exit(&connp->conn_lock);
3053 	if (error != 0) {
3054 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3055 		freemsg(mp);
3056 		goto done;
3057 	}
3058 
3059 	/* Sanity check length of packet */
3060 	ipha = (ipha_t *)mp->b_rptr;
3061 
3062 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3063 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3064 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3065 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3066 			freemsg(mp);
3067 			goto done;
3068 		}
3069 		ipha = (ipha_t *)mp->b_rptr;
3070 	}
3071 	ipha->ipha_version_and_hdr_length =
3072 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
3073 
3074 	/*
3075 	 * We set IXAF_DONTFRAG if the application set DF which makes
3076 	 * IP not fragment.
3077 	 */
3078 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3079 	if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3080 		ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3081 	else
3082 		ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3083 
3084 	/* Even for multicast and broadcast we honor the apps ttl */
3085 	ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3086 
3087 	/*
3088 	 * No source verification for non-local addresses
3089 	 */
3090 	if (ipha->ipha_src != INADDR_ANY &&
3091 	    ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
3092 	    is->is_netstack->netstack_ip, B_FALSE)
3093 	    != IPVL_UNICAST_UP) {
3094 		ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3095 	}
3096 
3097 	if (ipha->ipha_dst == INADDR_ANY)
3098 		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3099 
3100 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3101 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3102 
3103 	/* Defer IPsec if it might need to look at ICMP type/code */
3104 	do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3105 	ixa->ixa_flags |= IXAF_IS_IPV4;
3106 
3107 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3108 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3109 	    connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3110 	    (do_ipsec ? IPDF_IPSEC : 0));
3111 	switch (error) {
3112 	case 0:
3113 		break;
3114 	case EADDRNOTAVAIL:
3115 		/*
3116 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3117 		 * Don't have the application see that errno
3118 		 */
3119 		error = ENETUNREACH;
3120 		goto failed;
3121 	case ENETDOWN:
3122 		/*
3123 		 * Have !ipif_addr_ready address; drop packet silently
3124 		 * until we can get applications to not send until we
3125 		 * are ready.
3126 		 */
3127 		error = 0;
3128 		goto failed;
3129 	case EHOSTUNREACH:
3130 	case ENETUNREACH:
3131 		if (ixa->ixa_ire != NULL) {
3132 			/*
3133 			 * Let conn_ip_output/ire_send_noroute return
3134 			 * the error and send any local ICMP error.
3135 			 */
3136 			error = 0;
3137 			break;
3138 		}
3139 		/* FALLTHRU */
3140 	default:
3141 	failed:
3142 		freemsg(mp);
3143 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3144 		goto done;
3145 	}
3146 	if (ipha->ipha_src == INADDR_ANY)
3147 		IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3148 
3149 	/*
3150 	 * We might be going to a different destination than last time,
3151 	 * thus check that TX allows the communication and compute any
3152 	 * needed label.
3153 	 *
3154 	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3155 	 * don't have to worry about concurrent threads.
3156 	 */
3157 	if (is_system_labeled()) {
3158 		/*
3159 		 * Check whether Trusted Solaris policy allows communication
3160 		 * with this host, and pretend that the destination is
3161 		 * unreachable if not.
3162 		 * Compute any needed label and place it in ipp_label_v4/v6.
3163 		 *
3164 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3165 		 * ipp_label_v4/v6 to form the packet.
3166 		 *
3167 		 * Tsol note: We have ipp structure local to this thread so
3168 		 * no locking is needed.
3169 		 */
3170 		error = conn_update_label(connp, ixa, &v6dst, ipp);
3171 		if (error != 0) {
3172 			freemsg(mp);
3173 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3174 			goto done;
3175 		}
3176 	}
3177 
3178 	/*
3179 	 * Save away a copy of the IPv4 header the application passed down
3180 	 * and then prepend an IPv4 header complete with any IP options
3181 	 * including label.
3182 	 * We need a struct copy since icmp_prepend_hdr will reuse the available
3183 	 * space in the mblk.
3184 	 */
3185 	iphas = *ipha;
3186 	mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3187 
3188 	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3189 	if (mp == NULL) {
3190 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3191 		ASSERT(error != 0);
3192 		goto done;
3193 	}
3194 	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3195 		error = EMSGSIZE;
3196 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3197 		freemsg(mp);
3198 		goto done;
3199 	}
3200 	/* Restore key parts of the header that the application passed down */
3201 	ipha = (ipha_t *)mp->b_rptr;
3202 	ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3203 	ipha->ipha_ident = iphas.ipha_ident;
3204 	ipha->ipha_fragment_offset_and_flags =
3205 	    iphas.ipha_fragment_offset_and_flags;
3206 	ipha->ipha_ttl = iphas.ipha_ttl;
3207 	ipha->ipha_protocol = iphas.ipha_protocol;
3208 	ipha->ipha_src = iphas.ipha_src;
3209 	ipha->ipha_dst = iphas.ipha_dst;
3210 
3211 	ixa->ixa_protocol = ipha->ipha_protocol;
3212 
3213 	/*
3214 	 * Make sure that the IP header plus any transport header that is
3215 	 * checksumed by ip_output is in the first mblk. (ip_output assumes
3216 	 * that at least the checksum field is in the first mblk.)
3217 	 */
3218 	switch (ipha->ipha_protocol) {
3219 	case IPPROTO_UDP:
3220 		tp_hdr_len = 8;
3221 		break;
3222 	case IPPROTO_TCP:
3223 		tp_hdr_len = 20;
3224 		break;
3225 	default:
3226 		tp_hdr_len = 0;
3227 		break;
3228 	}
3229 	ip_hdr_length = IPH_HDR_LENGTH(ipha);
3230 	if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3231 		if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3232 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3233 			if (mp->b_cont == NULL)
3234 				error = EINVAL;
3235 			else
3236 				error = ENOMEM;
3237 			freemsg(mp);
3238 			goto done;
3239 		}
3240 	}
3241 
3242 	if (!do_ipsec) {
3243 		/* Policy might differ for different ICMP type/code */
3244 		if (ixa->ixa_ipsec_policy != NULL) {
3245 			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3246 			ixa->ixa_ipsec_policy = NULL;
3247 			ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3248 		}
3249 		mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3250 		if (mp == NULL) {
3251 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3252 			error = EHOSTUNREACH;	/* IPsec policy failure */
3253 			goto done;
3254 		}
3255 	}
3256 
3257 	/* We're done.  Pass the packet to ip. */
3258 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3259 
3260 	error = conn_ip_output(mp, ixa);
3261 	/* No rawipOutErrors if an error since IP increases its error counter */
3262 	switch (error) {
3263 	case 0:
3264 		break;
3265 	case EWOULDBLOCK:
3266 		(void) ixa_check_drain_insert(connp, ixa);
3267 		error = 0;
3268 		break;
3269 	case EADDRNOTAVAIL:
3270 		/*
3271 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3272 		 * Don't have the application see that errno
3273 		 */
3274 		error = ENETUNREACH;
3275 		break;
3276 	}
3277 done:
3278 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3279 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3280 	ixa->ixa_cpid = connp->conn_cpid;
3281 	ixa_refrele(ixa);
3282 	ip_pkt_free(ipp);
3283 	kmem_free(ipp, sizeof (*ipp));
3284 	return (error);
3285 }
3286 
3287 static mblk_t *
3288 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3289 {
3290 	ipha_t	*ipha = NULL;
3291 	ip6_t	*ip6h = NULL;
3292 
3293 	if (ixa->ixa_flags & IXAF_IS_IPV4)
3294 		ipha = (ipha_t *)mp->b_rptr;
3295 	else
3296 		ip6h = (ip6_t *)mp->b_rptr;
3297 
3298 	if (ixa->ixa_ipsec_policy != NULL) {
3299 		IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3300 		ixa->ixa_ipsec_policy = NULL;
3301 		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3302 	}
3303 	return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3304 }
3305 
3306 /*
3307  * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3308  * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3309  * the TPI options, otherwise we take them from msg_control.
3310  * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3311  * Always consumes mp; never consumes tudr_mp.
3312  */
3313 static int
3314 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3315     mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3316 {
3317 	icmp_t		*icmp = connp->conn_icmp;
3318 	icmp_stack_t	*is = icmp->icmp_is;
3319 	int		error;
3320 	ip_xmit_attr_t	*ixa;
3321 	ip_pkt_t	*ipp;
3322 	in6_addr_t	v6src;
3323 	in6_addr_t	v6dst;
3324 	in6_addr_t	v6nexthop;
3325 	in_port_t	dstport;
3326 	uint32_t	flowinfo;
3327 	uint_t		srcid;
3328 	int		is_absreq_failure = 0;
3329 	conn_opt_arg_t	coas, *coa;
3330 
3331 	ASSERT(tudr_mp != NULL || msg != NULL);
3332 
3333 	/*
3334 	 * Get ixa before checking state to handle a disconnect race.
3335 	 *
3336 	 * We need an exclusive copy of conn_ixa since the ancillary data
3337 	 * options might modify it. That copy has no pointers hence we
3338 	 * need to set them up once we've parsed the ancillary data.
3339 	 */
3340 	ixa = conn_get_ixa_exclusive(connp);
3341 	if (ixa == NULL) {
3342 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3343 		freemsg(mp);
3344 		return (ENOMEM);
3345 	}
3346 	ASSERT(cr != NULL);
3347 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3348 	ixa->ixa_cred = cr;
3349 	ixa->ixa_cpid = pid;
3350 	if (is_system_labeled()) {
3351 		/* We need to restart with a label based on the cred */
3352 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3353 	}
3354 
3355 	/* In case previous destination was multicast or multirt */
3356 	ip_attr_newdst(ixa);
3357 
3358 	/* Get a copy of conn_xmit_ipp since the options might change it */
3359 	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3360 	if (ipp == NULL) {
3361 		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3362 		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3363 		ixa->ixa_cpid = connp->conn_cpid;
3364 		ixa_refrele(ixa);
3365 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3366 		freemsg(mp);
3367 		return (ENOMEM);
3368 	}
3369 	mutex_enter(&connp->conn_lock);
3370 	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3371 	mutex_exit(&connp->conn_lock);
3372 	if (error != 0) {
3373 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3374 		freemsg(mp);
3375 		goto done;
3376 	}
3377 
3378 	/*
3379 	 * Parse the options and update ixa and ipp as a result.
3380 	 */
3381 
3382 	coa = &coas;
3383 	coa->coa_connp = connp;
3384 	coa->coa_ixa = ixa;
3385 	coa->coa_ipp = ipp;
3386 	coa->coa_ancillary = B_TRUE;
3387 	coa->coa_changed = 0;
3388 
3389 	if (msg != NULL) {
3390 		error = process_auxiliary_options(connp, msg->msg_control,
3391 		    msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3392 	} else {
3393 		struct T_unitdata_req *tudr;
3394 
3395 		tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3396 		ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3397 		error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3398 		    &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3399 		    coa, &is_absreq_failure);
3400 	}
3401 	if (error != 0) {
3402 		/*
3403 		 * Note: No special action needed in this
3404 		 * module for "is_absreq_failure"
3405 		 */
3406 		freemsg(mp);
3407 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3408 		goto done;
3409 	}
3410 	ASSERT(is_absreq_failure == 0);
3411 
3412 	mutex_enter(&connp->conn_lock);
3413 	/*
3414 	 * If laddr is unspecified then we look at sin6_src_id.
3415 	 * We will give precedence to a source address set with IPV6_PKTINFO
3416 	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3417 	 * want ip_attr_connect to select a source (since it can fail) when
3418 	 * IPV6_PKTINFO is specified.
3419 	 * If this doesn't result in a source address then we get a source
3420 	 * from ip_attr_connect() below.
3421 	 */
3422 	v6src = connp->conn_saddr_v6;
3423 	if (sin != NULL) {
3424 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3425 		dstport = sin->sin_port;
3426 		flowinfo = 0;
3427 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3428 		ixa->ixa_flags |= IXAF_IS_IPV4;
3429 	} else if (sin6 != NULL) {
3430 		v6dst = sin6->sin6_addr;
3431 		dstport = sin6->sin6_port;
3432 		flowinfo = sin6->sin6_flowinfo;
3433 		srcid = sin6->__sin6_src_id;
3434 		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3435 			ixa->ixa_scopeid = sin6->sin6_scope_id;
3436 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
3437 		} else {
3438 			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3439 		}
3440 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3441 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3442 			    connp->conn_netstack);
3443 		}
3444 		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
3445 			ixa->ixa_flags |= IXAF_IS_IPV4;
3446 		else
3447 			ixa->ixa_flags &= ~IXAF_IS_IPV4;
3448 	} else {
3449 		/* Connected case */
3450 		v6dst = connp->conn_faddr_v6;
3451 		flowinfo = connp->conn_flowinfo;
3452 	}
3453 	mutex_exit(&connp->conn_lock);
3454 	/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
3455 	if (ipp->ipp_fields & IPPF_ADDR) {
3456 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
3457 			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3458 				v6src = ipp->ipp_addr;
3459 		} else {
3460 			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3461 				v6src = ipp->ipp_addr;
3462 		}
3463 	}
3464 	/*
3465 	 * Allow source not assigned to the system
3466 	 * only if it is not a local addresses
3467 	 */
3468 	if (!V6_OR_V4_INADDR_ANY(v6src)) {
3469 		ip_laddr_t laddr_type;
3470 
3471 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
3472 			ipaddr_t v4src;
3473 
3474 			IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
3475 			laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
3476 			    is->is_netstack->netstack_ip, B_FALSE);
3477 		} else {
3478 			laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
3479 			    is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
3480 		}
3481 		if (laddr_type != IPVL_UNICAST_UP)
3482 			ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3483 	}
3484 
3485 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3486 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3487 	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3488 
3489 	switch (error) {
3490 	case 0:
3491 		break;
3492 	case EADDRNOTAVAIL:
3493 		/*
3494 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3495 		 * Don't have the application see that errno
3496 		 */
3497 		error = ENETUNREACH;
3498 		goto failed;
3499 	case ENETDOWN:
3500 		/*
3501 		 * Have !ipif_addr_ready address; drop packet silently
3502 		 * until we can get applications to not send until we
3503 		 * are ready.
3504 		 */
3505 		error = 0;
3506 		goto failed;
3507 	case EHOSTUNREACH:
3508 	case ENETUNREACH:
3509 		if (ixa->ixa_ire != NULL) {
3510 			/*
3511 			 * Let conn_ip_output/ire_send_noroute return
3512 			 * the error and send any local ICMP error.
3513 			 */
3514 			error = 0;
3515 			break;
3516 		}
3517 		/* FALLTHRU */
3518 	default:
3519 	failed:
3520 		freemsg(mp);
3521 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3522 		goto done;
3523 	}
3524 
3525 	/*
3526 	 * We might be going to a different destination than last time,
3527 	 * thus check that TX allows the communication and compute any
3528 	 * needed label.
3529 	 *
3530 	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3531 	 * don't have to worry about concurrent threads.
3532 	 */
3533 	if (is_system_labeled()) {
3534 		/*
3535 		 * Check whether Trusted Solaris policy allows communication
3536 		 * with this host, and pretend that the destination is
3537 		 * unreachable if not.
3538 		 * Compute any needed label and place it in ipp_label_v4/v6.
3539 		 *
3540 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3541 		 * ipp_label_v4/v6 to form the packet.
3542 		 *
3543 		 * Tsol note: We have ipp structure local to this thread so
3544 		 * no locking is needed.
3545 		 */
3546 		error = conn_update_label(connp, ixa, &v6dst, ipp);
3547 		if (error != 0) {
3548 			freemsg(mp);
3549 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3550 			goto done;
3551 		}
3552 	}
3553 	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3554 	    &error);
3555 	if (mp == NULL) {
3556 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3557 		ASSERT(error != 0);
3558 		goto done;
3559 	}
3560 	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3561 		error = EMSGSIZE;
3562 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3563 		freemsg(mp);
3564 		goto done;
3565 	}
3566 
3567 	/* Policy might differ for different ICMP type/code */
3568 	mp = icmp_output_attach_policy(mp, connp, ixa);
3569 	if (mp == NULL) {
3570 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3571 		error = EHOSTUNREACH;	/* IPsec policy failure */
3572 		goto done;
3573 	}
3574 
3575 	/* We're done.  Pass the packet to ip. */
3576 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3577 
3578 	error = conn_ip_output(mp, ixa);
3579 	if (!connp->conn_unspec_src)
3580 		ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3581 	/* No rawipOutErrors if an error since IP increases its error counter */
3582 	switch (error) {
3583 	case 0:
3584 		break;
3585 	case EWOULDBLOCK:
3586 		(void) ixa_check_drain_insert(connp, ixa);
3587 		error = 0;
3588 		break;
3589 	case EADDRNOTAVAIL:
3590 		/*
3591 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3592 		 * Don't have the application see that errno
3593 		 */
3594 		error = ENETUNREACH;
3595 		/* FALLTHRU */
3596 	default:
3597 		mutex_enter(&connp->conn_lock);
3598 		/*
3599 		 * Clear the source and v6lastdst so we call ip_attr_connect
3600 		 * for the next packet and try to pick a better source.
3601 		 */
3602 		if (connp->conn_mcbc_bind)
3603 			connp->conn_saddr_v6 = ipv6_all_zeros;
3604 		else
3605 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3606 		connp->conn_v6lastdst = ipv6_all_zeros;
3607 		mutex_exit(&connp->conn_lock);
3608 		break;
3609 	}
3610 done:
3611 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3612 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3613 	ixa->ixa_cpid = connp->conn_cpid;
3614 	ixa_refrele(ixa);
3615 	ip_pkt_free(ipp);
3616 	kmem_free(ipp, sizeof (*ipp));
3617 	return (error);
3618 }
3619 
3620 /*
3621  * Handle sending an M_DATA for a connected socket.
3622  * Handles both IPv4 and IPv6.
3623  */
3624 int
3625 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3626 {
3627 	icmp_t		*icmp = connp->conn_icmp;
3628 	icmp_stack_t	*is = icmp->icmp_is;
3629 	int		error;
3630 	ip_xmit_attr_t	*ixa;
3631 	boolean_t	do_ipsec;
3632 
3633 	/*
3634 	 * If no other thread is using conn_ixa this just gets a reference to
3635 	 * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3636 	 */
3637 	ixa = conn_get_ixa(connp, B_FALSE);
3638 	if (ixa == NULL) {
3639 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3640 		freemsg(mp);
3641 		return (ENOMEM);
3642 	}
3643 
3644 	ASSERT(cr != NULL);
3645 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3646 	ixa->ixa_cred = cr;
3647 	ixa->ixa_cpid = pid;
3648 
3649 	/* Defer IPsec if it might need to look at ICMP type/code */
3650 	switch (ixa->ixa_protocol) {
3651 	case IPPROTO_ICMP:
3652 	case IPPROTO_ICMPV6:
3653 		do_ipsec = B_FALSE;
3654 		break;
3655 	default:
3656 		do_ipsec = B_TRUE;
3657 	}
3658 
3659 	mutex_enter(&connp->conn_lock);
3660 	mp = icmp_prepend_header_template(connp, ixa, mp,
3661 	    &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3662 
3663 	if (mp == NULL) {
3664 		ASSERT(error != 0);
3665 		mutex_exit(&connp->conn_lock);
3666 		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3667 		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3668 		ixa->ixa_cpid = connp->conn_cpid;
3669 		ixa_refrele(ixa);
3670 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3671 		freemsg(mp);
3672 		return (error);
3673 	}
3674 
3675 	if (!do_ipsec) {
3676 		/* Policy might differ for different ICMP type/code */
3677 		mp = icmp_output_attach_policy(mp, connp, ixa);
3678 		if (mp == NULL) {
3679 			mutex_exit(&connp->conn_lock);
3680 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3681 			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3682 			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3683 			ixa->ixa_cpid = connp->conn_cpid;
3684 			ixa_refrele(ixa);
3685 			return (EHOSTUNREACH);	/* IPsec policy failure */
3686 		}
3687 	}
3688 
3689 	/*
3690 	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3691 	 * safe copy, then we need to fill in any pointers in it.
3692 	 */
3693 	if (ixa->ixa_ire == NULL) {
3694 		in6_addr_t	faddr, saddr;
3695 		in6_addr_t	nexthop;
3696 		in_port_t	fport;
3697 
3698 		saddr = connp->conn_saddr_v6;
3699 		faddr = connp->conn_faddr_v6;
3700 		fport = connp->conn_fport;
3701 		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3702 		mutex_exit(&connp->conn_lock);
3703 
3704 		error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3705 		    fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3706 		    (do_ipsec ? IPDF_IPSEC : 0));
3707 		switch (error) {
3708 		case 0:
3709 			break;
3710 		case EADDRNOTAVAIL:
3711 			/*
3712 			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3713 			 * Don't have the application see that errno
3714 			 */
3715 			error = ENETUNREACH;
3716 			goto failed;
3717 		case ENETDOWN:
3718 			/*
3719 			 * Have !ipif_addr_ready address; drop packet silently
3720 			 * until we can get applications to not send until we
3721 			 * are ready.
3722 			 */
3723 			error = 0;
3724 			goto failed;
3725 		case EHOSTUNREACH:
3726 		case ENETUNREACH:
3727 			if (ixa->ixa_ire != NULL) {
3728 				/*
3729 				 * Let conn_ip_output/ire_send_noroute return
3730 				 * the error and send any local ICMP error.
3731 				 */
3732 				error = 0;
3733 				break;
3734 			}
3735 			/* FALLTHRU */
3736 		default:
3737 		failed:
3738 			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3739 			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3740 			ixa->ixa_cpid = connp->conn_cpid;
3741 			ixa_refrele(ixa);
3742 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3743 			freemsg(mp);
3744 			return (error);
3745 		}
3746 	} else {
3747 		/* Done with conn_t */
3748 		mutex_exit(&connp->conn_lock);
3749 	}
3750 
3751 	/* We're done.  Pass the packet to ip. */
3752 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3753 
3754 	error = conn_ip_output(mp, ixa);
3755 	/* No rawipOutErrors if an error since IP increases its error counter */
3756 	switch (error) {
3757 	case 0:
3758 		break;
3759 	case EWOULDBLOCK:
3760 		(void) ixa_check_drain_insert(connp, ixa);
3761 		error = 0;
3762 		break;
3763 	case EADDRNOTAVAIL:
3764 		/*
3765 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3766 		 * Don't have the application see that errno
3767 		 */
3768 		error = ENETUNREACH;
3769 		break;
3770 	}
3771 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3772 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3773 	ixa->ixa_cpid = connp->conn_cpid;
3774 	ixa_refrele(ixa);
3775 	return (error);
3776 }
3777 
3778 /*
3779  * Handle sending an M_DATA to the last destination.
3780  * Handles both IPv4 and IPv6.
3781  *
3782  * NOTE: The caller must hold conn_lock and we drop it here.
3783  */
3784 int
3785 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3786     ip_xmit_attr_t *ixa)
3787 {
3788 	icmp_t		*icmp = connp->conn_icmp;
3789 	icmp_stack_t	*is = icmp->icmp_is;
3790 	int		error;
3791 	boolean_t	do_ipsec;
3792 
3793 	ASSERT(MUTEX_HELD(&connp->conn_lock));
3794 	ASSERT(ixa != NULL);
3795 
3796 	ASSERT(cr != NULL);
3797 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3798 	ixa->ixa_cred = cr;
3799 	ixa->ixa_cpid = pid;
3800 
3801 	/* Defer IPsec if it might need to look at ICMP type/code */
3802 	switch (ixa->ixa_protocol) {
3803 	case IPPROTO_ICMP:
3804 	case IPPROTO_ICMPV6:
3805 		do_ipsec = B_FALSE;
3806 		break;
3807 	default:
3808 		do_ipsec = B_TRUE;
3809 	}
3810 
3811 
3812 	mp = icmp_prepend_header_template(connp, ixa, mp,
3813 	    &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3814 
3815 	if (mp == NULL) {
3816 		ASSERT(error != 0);
3817 		mutex_exit(&connp->conn_lock);
3818 		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3819 		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3820 		ixa->ixa_cpid = connp->conn_cpid;
3821 		ixa_refrele(ixa);
3822 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3823 		freemsg(mp);
3824 		return (error);
3825 	}
3826 
3827 	if (!do_ipsec) {
3828 		/* Policy might differ for different ICMP type/code */
3829 		mp = icmp_output_attach_policy(mp, connp, ixa);
3830 		if (mp == NULL) {
3831 			mutex_exit(&connp->conn_lock);
3832 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3833 			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3834 			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3835 			ixa->ixa_cpid = connp->conn_cpid;
3836 			ixa_refrele(ixa);
3837 			return (EHOSTUNREACH);	/* IPsec policy failure */
3838 		}
3839 	}
3840 
3841 	/*
3842 	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3843 	 * safe copy, then we need to fill in any pointers in it.
3844 	 */
3845 	if (ixa->ixa_ire == NULL) {
3846 		in6_addr_t	lastdst, lastsrc;
3847 		in6_addr_t	nexthop;
3848 		in_port_t	lastport;
3849 
3850 		lastsrc = connp->conn_v6lastsrc;
3851 		lastdst = connp->conn_v6lastdst;
3852 		lastport = connp->conn_lastdstport;
3853 		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3854 		mutex_exit(&connp->conn_lock);
3855 
3856 		error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3857 		    &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3858 		    IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
3859 		switch (error) {
3860 		case 0:
3861 			break;
3862 		case EADDRNOTAVAIL:
3863 			/*
3864 			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3865 			 * Don't have the application see that errno
3866 			 */
3867 			error = ENETUNREACH;
3868 			goto failed;
3869 		case ENETDOWN:
3870 			/*
3871 			 * Have !ipif_addr_ready address; drop packet silently
3872 			 * until we can get applications to not send until we
3873 			 * are ready.
3874 			 */
3875 			error = 0;
3876 			goto failed;
3877 		case EHOSTUNREACH:
3878 		case ENETUNREACH:
3879 			if (ixa->ixa_ire != NULL) {
3880 				/*
3881 				 * Let conn_ip_output/ire_send_noroute return
3882 				 * the error and send any local ICMP error.
3883 				 */
3884 				error = 0;
3885 				break;
3886 			}
3887 			/* FALLTHRU */
3888 		default:
3889 		failed:
3890 			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3891 			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3892 			ixa->ixa_cpid = connp->conn_cpid;
3893 			ixa_refrele(ixa);
3894 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3895 			freemsg(mp);
3896 			return (error);
3897 		}
3898 	} else {
3899 		/* Done with conn_t */
3900 		mutex_exit(&connp->conn_lock);
3901 	}
3902 
3903 	/* We're done.  Pass the packet to ip. */
3904 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3905 	error = conn_ip_output(mp, ixa);
3906 	/* No rawipOutErrors if an error since IP increases its error counter */
3907 	switch (error) {
3908 	case 0:
3909 		break;
3910 	case EWOULDBLOCK:
3911 		(void) ixa_check_drain_insert(connp, ixa);
3912 		error = 0;
3913 		break;
3914 	case EADDRNOTAVAIL:
3915 		/*
3916 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3917 		 * Don't have the application see that errno
3918 		 */
3919 		error = ENETUNREACH;
3920 		/* FALLTHRU */
3921 	default:
3922 		mutex_enter(&connp->conn_lock);
3923 		/*
3924 		 * Clear the source and v6lastdst so we call ip_attr_connect
3925 		 * for the next packet and try to pick a better source.
3926 		 */
3927 		if (connp->conn_mcbc_bind)
3928 			connp->conn_saddr_v6 = ipv6_all_zeros;
3929 		else
3930 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3931 		connp->conn_v6lastdst = ipv6_all_zeros;
3932 		mutex_exit(&connp->conn_lock);
3933 		break;
3934 	}
3935 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3936 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3937 	ixa->ixa_cpid = connp->conn_cpid;
3938 	ixa_refrele(ixa);
3939 	return (error);
3940 }
3941 
3942 
3943 /*
3944  * Prepend the header template and then fill in the source and
3945  * flowinfo. The caller needs to handle the destination address since
3946  * it's setting is different if rthdr or source route.
3947  *
3948  * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
3949  * When it returns NULL it sets errorp.
3950  */
3951 static mblk_t *
3952 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3953     const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
3954 {
3955 	icmp_t		*icmp = connp->conn_icmp;
3956 	icmp_stack_t	*is = icmp->icmp_is;
3957 	uint_t		pktlen;
3958 	uint_t		copylen;
3959 	uint8_t		*iph;
3960 	uint_t		ip_hdr_length;
3961 	uint32_t	cksum;
3962 	ip_pkt_t	*ipp;
3963 
3964 	ASSERT(MUTEX_HELD(&connp->conn_lock));
3965 
3966 	/*
3967 	 * Copy the header template.
3968 	 */
3969 	copylen = connp->conn_ht_iphc_len;
3970 	pktlen = copylen + msgdsize(mp);
3971 	if (pktlen > IP_MAXPACKET) {
3972 		freemsg(mp);
3973 		*errorp = EMSGSIZE;
3974 		return (NULL);
3975 	}
3976 	ixa->ixa_pktlen = pktlen;
3977 
3978 	/* check/fix buffer config, setup pointers into it */
3979 	iph = mp->b_rptr - copylen;
3980 	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3981 		mblk_t *mp1;
3982 
3983 		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
3984 		if (mp1 == NULL) {
3985 			freemsg(mp);
3986 			*errorp = ENOMEM;
3987 			return (NULL);
3988 		}
3989 		mp1->b_wptr = DB_LIM(mp1);
3990 		mp1->b_cont = mp;
3991 		mp = mp1;
3992 		iph = (mp->b_wptr - copylen);
3993 	}
3994 	mp->b_rptr = iph;
3995 	bcopy(connp->conn_ht_iphc, iph, copylen);
3996 	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
3997 
3998 	ixa->ixa_ip_hdr_length = ip_hdr_length;
3999 
4000 	/*
4001 	 * Prepare for ICMPv6 checksum done in IP.
4002 	 *
4003 	 * icmp_build_hdr_template has already massaged any routing header
4004 	 * and placed the result in conn_sum.
4005 	 *
4006 	 * We make it easy for IP to include our pseudo header
4007 	 * by putting our length (and any routing header adjustment)
4008 	 * in the ICMPv6 checksum field.
4009 	 */
4010 	cksum = pktlen - ip_hdr_length;
4011 
4012 	cksum += connp->conn_sum;
4013 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
4014 	ASSERT(cksum < 0x10000);
4015 
4016 	ipp = &connp->conn_xmit_ipp;
4017 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
4018 		ipha_t	*ipha = (ipha_t *)iph;
4019 
4020 		ipha->ipha_length = htons((uint16_t)pktlen);
4021 
4022 		/* if IP_PKTINFO specified an addres it wins over bind() */
4023 		if ((ipp->ipp_fields & IPPF_ADDR) &&
4024 		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4025 			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
4026 			ipha->ipha_src = ipp->ipp_addr_v4;
4027 		} else {
4028 			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
4029 		}
4030 	} else {
4031 		ip6_t *ip6h = (ip6_t *)iph;
4032 		uint_t	cksum_offset = 0;
4033 
4034 		ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
4035 
4036 		/* if IP_PKTINFO specified an addres it wins over bind() */
4037 		if ((ipp->ipp_fields & IPPF_ADDR) &&
4038 		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4039 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
4040 			ip6h->ip6_src = ipp->ipp_addr;
4041 		} else {
4042 			ip6h->ip6_src = *v6src;
4043 		}
4044 		ip6h->ip6_vcf =
4045 		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4046 		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4047 		if (ipp->ipp_fields & IPPF_TCLASS) {
4048 			/* Overrides the class part of flowinfo */
4049 			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4050 			    ipp->ipp_tclass);
4051 		}
4052 
4053 		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
4054 			if (connp->conn_proto == IPPROTO_ICMPV6) {
4055 				cksum_offset = ixa->ixa_ip_hdr_length +
4056 				    offsetof(icmp6_t, icmp6_cksum);
4057 			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4058 				cksum_offset = ixa->ixa_ip_hdr_length +
4059 				    ixa->ixa_raw_cksum_offset;
4060 			}
4061 		}
4062 		if (cksum_offset != 0) {
4063 			uint16_t *ptr;
4064 
4065 			/* Make sure the checksum fits in the first mblk */
4066 			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4067 				mblk_t *mp1;
4068 
4069 				mp1 = msgpullup(mp,
4070 				    cksum_offset + sizeof (short));
4071 				freemsg(mp);
4072 				if (mp1 == NULL) {
4073 					*errorp = ENOMEM;
4074 					return (NULL);
4075 				}
4076 				mp = mp1;
4077 				iph = mp->b_rptr;
4078 				ip6h = (ip6_t *)iph;
4079 			}
4080 			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4081 			*ptr = htons(cksum);
4082 		}
4083 	}
4084 
4085 	return (mp);
4086 }
4087 
/*
 * This routine handles all messages passed downstream.  It either
 * consumes the message or passes it downstream; it never queues a
 * message.
 *
 * Only a well-formed T_UNITDATA_REQ is transmitted from here.  M_DATA is
 * counted as an output error and dropped, and every other message is
 * handed to icmp_wput_other().
 *
 * For a T_UNITDATA_REQ, mp is the TPI header and mp->b_cont the payload.
 * On success the header is released with freeb(); on failure it is passed
 * to icmp_ud_err() together with the errno value.
 */
void
icmp_wput(queue_t *q, mblk_t *mp)
{
	sin6_t		*sin6;
	sin_t		*sin = NULL;
	uint_t		srcid;
	conn_t		*connp = Q_TO_CONN(q);
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	struct sockaddr	*addr = NULL;
	socklen_t	addrlen;
	icmp_stack_t	*is = icmp->icmp_is;
	struct T_unitdata_req *tudr;
	mblk_t		*data_mp;
	cred_t		*cr;
	pid_t		pid;

	/*
	 * We directly handle several cases here: T_UNITDATA_REQ message
	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
	 * socket.
	 */
	switch (DB_TYPE(mp)) {
	case M_DATA:
		/* sockfs never sends down M_DATA */
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		return;

	case M_PROTO:
	case M_PCPROTO:
		tudr = (struct T_unitdata_req *)mp->b_rptr;
		/*
		 * Too short to be a T_unitdata_req, or some other primitive:
		 * let icmp_wput_other() sort it out.
		 */
		if (MBLKL(mp) < sizeof (*tudr) ||
		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
			icmp_wput_other(q, mp);
			return;
		}
		break;

	default:
		icmp_wput_other(q, mp);
		return;
	}

	/* Handle valid T_UNITDATA_REQ here */
	data_mp = mp->b_cont;
	if (data_mp == NULL) {
		error = EPROTO;
		goto ud_error2;
	}
	/*
	 * Detach the payload from the TPI header; from here on data_mp and
	 * mp are disposed of separately.
	 */
	mp->b_cont = NULL;

	/* The destination address must lie entirely inside the header mblk */
	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
		error = EADDRNOTAVAIL;
		goto ud_error2;
	}

	/*
	 * All Solaris components should pass a db_credp
	 * for this message, hence we ASSERT.
	 * On production kernels we return an error to be robust against
	 * random streams modules sitting on top of us.
	 */
	cr = msg_getcred(mp, &pid);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		error = EINVAL;
		goto ud_error2;
	}

	/*
	 * If a port has not been bound to the stream, fail.
	 * This is not a problem when sockfs is directly
	 * above us, because it will ensure that the socket
	 * is first bound before allowing data to be sent.
	 */
	if (icmp->icmp_state == TS_UNBND) {
		error = EPROTO;
		goto ud_error2;
	}
	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
	addrlen = tudr->DEST_length;

	switch (connp->conn_family) {
	case AF_INET6:
		sin6 = (sin6_t *)addr;
		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
		    (sin6->sin6_family != AF_INET6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		srcid = sin6->__sin6_src_id;

		/*
		 * If the local address is a mapped address return
		 * an error.
		 * It would be possible to send an IPv6 packet but the
		 * response would never make it back to the application
		 * since it is bound to a mapped address.
		 */
		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* An unspecified destination is rewritten to loopback */
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			sin6->sin6_addr = ipv6_loopback;

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v6(connp, sin6)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, NULL, sin6,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v6 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v6(connp, sin6) &&
			    connp->conn_lastsrcid == srcid &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, NULL,
				    sin6, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Sent; only the TPI header is left to release */
			freeb(mp);
			return;
		}
		break;

	case AF_INET:
		sin = (sin_t *)addr;
		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
		    (sin->sin_family != AF_INET)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		/* An unspecified destination is rewritten to loopback */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		/* Protocol 255 contains full IP headers */
		/* Read without holding lock */
		if (icmp->icmp_hdrincl) {
			/* Make sure a simple IP header sits in the first mblk */
			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
					error = EINVAL;
					goto ud_error2;
				}
			}
			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
			if (error == 0) {
				freeb(mp);
				return;
			}
			/* data_mp consumed above */
			data_mp = NULL;
			goto ud_error2;
		}

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v4(connp, sin)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, sin, NULL,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v4 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v4(connp, sin) &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, sin,
				    NULL, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Sent; only the TPI header is left to release */
			freeb(mp);
			return;
		}
		break;
	}
	/*
	 * An output routine reported an error.  data_mp is not freed on this
	 * path and rawipOutErrors is not bumped here.
	 * NOTE(review): this relies on the output routines having disposed
	 * of data_mp and counted the error themselves (icmp_output_newdst
	 * does); confirm for icmp_output_lastdst/icmp_output_ancillary.
	 */
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
	return;

ud_error2:
	/*
	 * Failure while this routine still owns the payload (data_mp may be
	 * NULL, which freemsg is handed as-is): count the error, drop the
	 * payload and report the error using the TPI header.
	 */
	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
	freemsg(data_mp);
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
}
4341 
/*
 * Handle the case of the IP address or flow label being different
 * for both IPv4 and IPv6.
 *
 * sin is non-NULL for an IPv4 destination; otherwise the destination is
 * taken from sin6.  cr and pid are placed in the ixa for the duration of
 * the send and the conn's own values are restored before returning.
 *
 * Returns 0 or an errno value.  On every return path visible here the
 * reference on ixa is released and data_mp has either been handed on
 * (to icmp_output_attach_policy/conn_ip_output) or freed.
 *
 * NOTE: The caller must hold conn_lock and we drop it here.
 */
static int
icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
    cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
{
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		error;
	ip_xmit_attr_t	*oldixa;
	boolean_t	do_ipsec;
	uint_t		srcid;
	uint32_t	flowinfo;
	in6_addr_t	v6src;
	in6_addr_t	v6dst;
	in6_addr_t	v6nexthop;
	in_port_t	dstport;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT(ixa != NULL);

	/*
	 * We hold conn_lock across all the use and modifications of
	 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
	 * stay consistent.
	 */

	ASSERT(cr != NULL);
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = cr;
	ixa->ixa_cpid = pid;
	if (is_system_labeled()) {
		/* We need to restart with a label based on the cred */
		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
	}
	/*
	 * If we are connected then the destination needs to be the
	 * same as the connected one, which is not the case here since we
	 * checked for that above.
	 */
	if (icmp->icmp_state == TS_DATA_XFER) {
		mutex_exit(&connp->conn_lock);
		error = EISCONN;
		goto ud_error;
	}

	/* In case previous destination was multicast or multirt */
	ip_attr_newdst(ixa);

	/*
	 * If laddr is unspecified then we look at sin6_src_id.
	 * We will give precedence to a source address set with IPV6_PKTINFO
	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
	 * want ip_attr_connect to select a source (since it can fail) when
	 * IPV6_PKTINFO is specified.
	 * If this doesn't result in a source address then we get a source
	 * from ip_attr_connect() below.
	 */
	v6src = connp->conn_saddr_v6;
	if (sin != NULL) {
		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
		dstport = sin->sin_port;
		flowinfo = 0;
		srcid = 0;
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		/*
		 * NOTE(review): srcid was set to 0 just above, so this
		 * condition can never be true on the IPv4 path.
		 */
		if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
			    connp->conn_netstack);
		}
		ixa->ixa_flags |= IXAF_IS_IPV4;
	} else {
		v6dst = sin6->sin6_addr;
		dstport = sin6->sin6_port;
		flowinfo = sin6->sin6_flowinfo;
		srcid = sin6->__sin6_src_id;
		/* A scope id is only honoured for link-scope destinations */
		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
			ixa->ixa_scopeid = sin6->sin6_scope_id;
			ixa->ixa_flags |= IXAF_SCOPEID_SET;
		} else {
			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		}
		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
			    connp->conn_netstack);
		}
		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
			ixa->ixa_flags |= IXAF_IS_IPV4;
		else
			ixa->ixa_flags &= ~IXAF_IS_IPV4;
	}
	/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
	if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
		ip_pkt_t *ipp = &connp->conn_xmit_ipp;

		/* Only use it when its family matches the packet's */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		} else {
			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		}
	}

	/* Defer IPsec if it might need to look at ICMP type/code */
	switch (ixa->ixa_protocol) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		do_ipsec = B_FALSE;
		break;
	default:
		do_ipsec = B_TRUE;
	}

	ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
	/* conn_lock is dropped around ip_attr_connect and retaken below */
	mutex_exit(&connp->conn_lock);

	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
	    (do_ipsec ? IPDF_IPSEC : 0));
	switch (error) {
	case 0:
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		goto failed;
	case ENETDOWN:
		/*
		 * Have !ipif_addr_ready address; drop packet silently
		 * until we can get applications to not send until we
		 * are ready.
		 */
		error = 0;
		goto failed;
	case EHOSTUNREACH:
	case ENETUNREACH:
		if (ixa->ixa_ire != NULL) {
			/*
			 * Let conn_ip_output/ire_send_noroute return
			 * the error and send any local ICMP error.
			 */
			error = 0;
			break;
		}
		/* FALLTHRU */
	default:
	failed:
		goto ud_error;
	}

	mutex_enter(&connp->conn_lock);
	/*
	 * While we dropped the lock some other thread might have connected
	 * this socket. If so we bail out with EISCONN to ensure that the
	 * connecting thread is the one that updates conn_ixa, conn_ht_*
	 * and conn_*last*.
	 */
	if (icmp->icmp_state == TS_DATA_XFER) {
		mutex_exit(&connp->conn_lock);
		error = EISCONN;
		goto ud_error;
	}

	/*
	 * We need to rebuild the headers if
	 *  - we are labeling packets (could be different for different
	 *    destinations)
	 *  - we have a source route (or routing header) since we need to
	 *    massage that to get the pseudo-header checksum
	 *  - a socket option with COA_HEADER_CHANGED has been set which
	 *    set conn_v6lastdst to zero.
	 *
	 * Otherwise the prepend function will just update the src, dst,
	 * and flow label.
	 */
	if (is_system_labeled()) {
		/* TX MLP requires SCM_UCRED and don't have that here */
		if (connp->conn_mlp_type != mlptSingle) {
			mutex_exit(&connp->conn_lock);
			error = ECONNREFUSED;
			goto ud_error;
		}
		/*
		 * Check whether Trusted Solaris policy allows communication
		 * with this host, and pretend that the destination is
		 * unreachable if not.
		 * Compute any needed label and place it in ipp_label_v4/v6.
		 *
		 * Later conn_build_hdr_template/conn_prepend_hdr takes
		 * ipp_label_v4/v6 to form the packet.
		 *
		 * Tsol note: Since we hold conn_lock we know no other
		 * thread manipulates conn_xmit_ipp.
		 */
		error = conn_update_label(connp, ixa, &v6dst,
		    &connp->conn_xmit_ipp);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
		/* Rebuild the header template */
		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
		    flowinfo);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
	} else if (connp->conn_xmit_ipp.ipp_fields &
	    (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
	    IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
		/* Rebuild the header template */
		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
		    flowinfo);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
	} else {
		/* Simply update the destination address if no source route */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			ipha_t	*ipha = (ipha_t *)connp->conn_ht_iphc;

			IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
			/* Keep the template's DF bit in step with the ixa */
			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
				ipha->ipha_fragment_offset_and_flags |=
				    IPH_DF_HTONS;
			} else {
				ipha->ipha_fragment_offset_and_flags &=
				    ~IPH_DF_HTONS;
			}
		} else {
			ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
			ip6h->ip6_dst = v6dst;
		}
	}

	/*
	 * Remember the dst etc which corresponds to the built header
	 * template and conn_ixa.
	 */
	oldixa = conn_replace_ixa(connp, ixa);
	connp->conn_v6lastdst = v6dst;
	connp->conn_lastflowinfo = flowinfo;
	connp->conn_lastscopeid = ixa->ixa_scopeid;
	connp->conn_lastsrcid = srcid;
	/* Also remember a source to use together with lastdst */
	connp->conn_v6lastsrc = v6src;

	data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
	    flowinfo, &error);

	/* Done with conn_t */
	mutex_exit(&connp->conn_lock);
	ixa_refrele(oldixa);

	if (data_mp == NULL) {
		ASSERT(error != 0);
		goto ud_error;
	}

	if (!do_ipsec) {
		/* Policy might differ for different ICMP type/code */
		data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
		if (data_mp == NULL) {
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			error = EHOSTUNREACH;	/* IPsec policy failure */
			goto done;
		}
	}

	/* We're done.  Pass the packet to ip. */
	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);

	error = conn_ip_output(data_mp, ixa);
	/* No rawipOutErrors if an error since IP increases its error counter */
	switch (error) {
	case 0:
		break;
	case EWOULDBLOCK:
		/* Not reported to the caller as an error */
		(void) ixa_check_drain_insert(connp, ixa);
		error = 0;
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		/* FALLTHRU */
	default:
		mutex_enter(&connp->conn_lock);
		/*
		 * Clear the source and v6lastdst so we call ip_attr_connect
		 * for the next packet and try to pick a better source.
		 */
		if (connp->conn_mcbc_bind)
			connp->conn_saddr_v6 = ipv6_all_zeros;
		else
			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
		connp->conn_v6lastdst = ipv6_all_zeros;
		mutex_exit(&connp->conn_lock);
		break;
	}
done:
	/* Exit path for when data_mp is no longer ours to free */
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);
	return (error);

ud_error:
	/*
	 * Exit path for failures while data_mp is still ours (or already
	 * NULL): restore the ixa, count the error and free the payload.
	 */
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);

	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
	freemsg(data_mp);
	return (error);
}
4669 
/*
 * Write-side put routine that discards whatever it is given.  The queue
 * argument is unused.  On DEBUG kernels each discarded message is noted
 * on the console/log via cmn_err().
 */
/* ARGSUSED */
static void
icmp_wput_fallback(queue_t *q, mblk_t *mp)
{
#ifdef DEBUG
	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
#endif
	/* The message is dropped unconditionally */
	freemsg(mp);
}
4679 
4680 static void
4681 icmp_wput_other(queue_t *q, mblk_t *mp)
4682 {
4683 	uchar_t	*rptr = mp->b_rptr;
4684 	struct iocblk *iocp;
4685 	conn_t	*connp = Q_TO_CONN(q);
4686 	icmp_t	*icmp = connp->conn_icmp;
4687 	cred_t *cr;
4688 
4689 	switch (mp->b_datap->db_type) {
4690 	case M_PROTO:
4691 	case M_PCPROTO:
4692 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4693 			/*
4694 			 * If the message does not contain a PRIM_type,
4695 			 * throw it away.
4696 			 */
4697 			freemsg(mp);
4698 			return;
4699 		}
4700 		switch (((t_primp_t)rptr)->type) {
4701 		case T_ADDR_REQ:
4702 			icmp_addr_req(q, mp);
4703 			return;
4704 		case O_T_BIND_REQ:
4705 		case T_BIND_REQ:
4706 			icmp_tpi_bind(q, mp);
4707 			return;
4708 		case T_CONN_REQ:
4709 			icmp_tpi_connect(q, mp);
4710 			return;
4711 		case T_CAPABILITY_REQ:
4712 			icmp_capability_req(q, mp);
4713 			return;
4714 		case T_INFO_REQ:
4715 			icmp_info_req(q, mp);
4716 			return;
4717 		case T_UNITDATA_REQ:
4718 			/*
4719 			 * If a T_UNITDATA_REQ gets here, the address must
4720 			 * be bad.  Valid T_UNITDATA_REQs are handled
4721 			 * in icmp_wput.
4722 			 */
4723 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4724 			return;
4725 		case T_UNBIND_REQ:
4726 			icmp_tpi_unbind(q, mp);
4727 			return;
4728 		case T_SVR4_OPTMGMT_REQ:
4729 			/*
4730 			 * All Solaris components should pass a db_credp
4731 			 * for this TPI message, hence we ASSERT.
4732 			 * But in case there is some other M_PROTO that looks
4733 			 * like a TPI message sent by some other kernel
4734 			 * component, we check and return an error.
4735 			 */
4736 			cr = msg_getcred(mp, NULL);
4737 			ASSERT(cr != NULL);
4738 			if (cr == NULL) {
4739 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4740 				return;
4741 			}
4742 
4743 			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4744 			    cr)) {
4745 				svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4746 			}
4747 			return;
4748 
4749 		case T_OPTMGMT_REQ:
4750 			/*
4751 			 * All Solaris components should pass a db_credp
4752 			 * for this TPI message, hence we ASSERT.
4753 			 * But in case there is some other M_PROTO that looks
4754 			 * like a TPI message sent by some other kernel
4755 			 * component, we check and return an error.
4756 			 */
4757 			cr = msg_getcred(mp, NULL);
4758 			ASSERT(cr != NULL);
4759 			if (cr == NULL) {
4760 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4761 				return;
4762 			}
4763 			tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4764 			return;
4765 
4766 		case T_DISCON_REQ:
4767 			icmp_tpi_disconnect(q, mp);
4768 			return;
4769 
4770 		/* The following TPI message is not supported by icmp. */
4771 		case O_T_CONN_RES:
4772 		case T_CONN_RES:
4773 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4774 			return;
4775 
4776 		/* The following 3 TPI requests are illegal for icmp. */
4777 		case T_DATA_REQ:
4778 		case T_EXDATA_REQ:
4779 		case T_ORDREL_REQ:
4780 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4781 			return;
4782 		default:
4783 			break;
4784 		}
4785 		break;
4786 	case M_FLUSH:
4787 		if (*rptr & FLUSHW)
4788 			flushq(q, FLUSHDATA);
4789 		break;
4790 	case M_IOCTL:
4791 		iocp = (struct iocblk *)mp->b_rptr;
4792 		switch (iocp->ioc_cmd) {
4793 		case TI_GETPEERNAME:
4794 			if (icmp->icmp_state != TS_DATA_XFER) {
4795 				/*
4796 				 * If a default destination address has not
4797 				 * been associated with the stream, then we
4798 				 * don't know the peer's name.
4799 				 */
4800 				iocp->ioc_error = ENOTCONN;
4801 				iocp->ioc_count = 0;
4802 				mp->b_datap->db_type = M_IOCACK;
4803 				qreply(q, mp);
4804 				return;
4805 			}
4806 			/* FALLTHRU */
4807 		case TI_GETMYNAME:
4808 			/*
4809 			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
4810 			 * need to copyin the user's strbuf structure.
4811 			 * Processing will continue in the M_IOCDATA case
4812 			 * below.
4813 			 */
4814 			mi_copyin(q, mp, NULL,
4815 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4816 			return;
4817 		default:
4818 			break;
4819 		}
4820 		break;
4821 	case M_IOCDATA:
4822 		icmp_wput_iocdata(q, mp);
4823 		return;
4824 	default:
4825 		/* Unrecognized messages are passed through without change. */
4826 		break;
4827 	}
4828 	ip_wput_nondata(q, mp);
4829 }
4830 
/*
 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
 * messages.
 *
 * Only the TI_GETMYNAME and TI_GETPEERNAME ioctls are processed here;
 * M_IOCDATA for any other command is passed to ip_wput_nondata().
 * The sequence driven by mi_copy_state() is: strbuf copied in, address
 * copied out, strbuf copied out, ioctl acknowledged.
 */
static void
icmp_wput_iocdata(queue_t *q, mblk_t *mp)
{
	mblk_t		*mp1;
	STRUCT_HANDLE(strbuf, sb);
	uint_t		addrlen;
	conn_t		*connp = Q_TO_CONN(q);
	icmp_t		*icmp = connp->conn_icmp;

	/* Make sure it is one of ours. */
	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
	case TI_GETMYNAME:
	case TI_GETPEERNAME:
		break;
	default:
		ip_wput_nondata(q, mp);
		return;
	}

	switch (mi_copy_state(q, mp, &mp1)) {
	case -1:
		/*
		 * NOTE(review): presumably mi_copy_state() has already
		 * completed the ioctl in this case - confirm.
		 */
		return;
	case MI_COPY_CASE(MI_COPY_IN, 1):
		/* The strbuf has been copied in; carry on below */
		break;
	case MI_COPY_CASE(MI_COPY_OUT, 1):
		/*
		 * The address has been copied out, so now
		 * copyout the strbuf.
		 */
		mi_copyout(q, mp);
		return;
	case MI_COPY_CASE(MI_COPY_OUT, 2):
		/*
		 * The address and strbuf have been copied out.
		 * We're done, so just acknowledge the original
		 * M_IOCTL.
		 */
		mi_copy_done(q, mp, 0);
		return;
	default:
		/*
		 * Something strange has happened, so acknowledge
		 * the original M_IOCTL with an EPROTO error.
		 */
		mi_copy_done(q, mp, EPROTO);
		return;
	}

	/*
	 * Now we have the strbuf structure for TI_GETMYNAME
	 * and TI_GETPEERNAME.  Next we copyout the requested
	 * address and then we'll copyout the strbuf.
	 */
	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
	    (void *)mp1->b_rptr);

	/* The size of the address depends on the socket's family */
	if (connp->conn_family == AF_INET)
		addrlen = sizeof (sin_t);
	else
		addrlen = sizeof (sin6_t);

	/* The caller's buffer must be able to hold the whole address */
	if (STRUCT_FGET(sb, maxlen) < addrlen) {
		mi_copy_done(q, mp, EINVAL);
		return;
	}
	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
	case TI_GETMYNAME:
		break;
	case TI_GETPEERNAME:
		/* There is a peer name only while connected */
		if (icmp->icmp_state != TS_DATA_XFER) {
			mi_copy_done(q, mp, ENOTCONN);
			return;
		}
		break;
	default:
		mi_copy_done(q, mp, EPROTO);
		return;
	}
	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
	/*
	 * NOTE(review): nothing is acknowledged here on allocation failure;
	 * presumably mi_copyout_alloc() does that itself - confirm.
	 */
	if (!mp1)
		return;

	STRUCT_FSET(sb, len, addrlen);
	/* Write the requested address at b_wptr of the copyout mblk */
	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
	case TI_GETMYNAME:
		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
		    &addrlen);
		break;
	case TI_GETPEERNAME:
		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
		    &addrlen);
		break;
	}
	mp1->b_wptr += addrlen;
	/* Copy out the address */
	mi_copyout(q, mp);
}
4932 
/*
 * Global initialization: computes icmp_max_optsize from the icmp option
 * table and registers the rawip per-netstack create/destroy callbacks.
 */
void
icmp_ddi_g_init(void)
{
	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
	    icmp_opt_obj.odb_opt_arr_cnt);

	/*
	 * We want to be informed each time a stack is created or
	 * destroyed in the kernel, so we can maintain the
	 * set of icmp_stack_t's.
	 */
	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
}
4946 
/*
 * Global teardown: undoes the netstack registration made by
 * icmp_ddi_g_init().
 */
void
icmp_ddi_g_destroy(void)
{
	netstack_unregister(NS_ICMP);
}
4952 
4953 #define	INET_NAME	"ip"
4954 
4955 /*
4956  * Initialize the ICMP stack instance.
4957  */
4958 static void *
4959 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
4960 {
4961 	icmp_stack_t	*is;
4962 	int		error = 0;
4963 	size_t		arrsz;
4964 	major_t		major;
4965 
4966 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
4967 	is->is_netstack = ns;
4968 
4969 	arrsz = sizeof (icmp_propinfo_tbl);
4970 	is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
4971 	bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz);
4972 
4973 	is->is_ksp = rawip_kstat_init(stackid);
4974 
4975 	major = mod_name_to_major(INET_NAME);
4976 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
4977 	ASSERT(error == 0);
4978 	return (is);
4979 }
4980 
4981 /*
4982  * Free the ICMP stack instance.
4983  */
4984 static void
4985 rawip_stack_fini(netstackid_t stackid, void *arg)
4986 {
4987 	icmp_stack_t *is = (icmp_stack_t *)arg;
4988 
4989 	kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl));
4990 	is->is_propinfo_tbl = NULL;
4991 
4992 	rawip_kstat_fini(stackid, is->is_ksp);
4993 	is->is_ksp = NULL;
4994 	ldi_ident_release(is->is_ldi_ident);
4995 	kmem_free(is, sizeof (*is));
4996 }
4997 
4998 static void *
4999 rawip_kstat_init(netstackid_t stackid) {
5000 	kstat_t	*ksp;
5001 
5002 	rawip_named_kstat_t template = {
5003 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5004 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5005 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5006 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5007 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5008 	};
5009 
5010 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5011 					KSTAT_TYPE_NAMED,
5012 					NUM_OF_FIELDS(rawip_named_kstat_t),
5013 					0, stackid);
5014 	if (ksp == NULL || ksp->ks_data == NULL)
5015 		return (NULL);
5016 
5017 	bcopy(&template, ksp->ks_data, sizeof (template));
5018 	ksp->ks_update = rawip_kstat_update;
5019 	ksp->ks_private = (void *)(uintptr_t)stackid;
5020 
5021 	kstat_install(ksp);
5022 	return (ksp);
5023 }
5024 
5025 static void
5026 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5027 {
5028 	if (ksp != NULL) {
5029 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5030 		kstat_delete_netstack(ksp, stackid);
5031 	}
5032 }
5033 
5034 static int
5035 rawip_kstat_update(kstat_t *ksp, int rw)
5036 {
5037 	rawip_named_kstat_t *rawipkp;
5038 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5039 	netstack_t	*ns;
5040 	icmp_stack_t	*is;
5041 
5042 	if ((ksp == NULL) || (ksp->ks_data == NULL))
5043 		return (EIO);
5044 
5045 	if (rw == KSTAT_WRITE)
5046 		return (EACCES);
5047 
5048 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5049 
5050 	ns = netstack_find_by_stackid(stackid);
5051 	if (ns == NULL)
5052 		return (-1);
5053 	is = ns->netstack_icmp;
5054 	if (is == NULL) {
5055 		netstack_rele(ns);
5056 		return (-1);
5057 	}
5058 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5059 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5060 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5061 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5062 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5063 	netstack_rele(ns);
5064 	return (0);
5065 }
5066 
/*
 * accept() entry point for raw sockets.  None of the arguments are
 * examined; the call always fails with EOPNOTSUPP.
 */
/* ARGSUSED */
int
rawip_accept(sock_lower_handle_t lproto_handle,
    sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
    cred_t *cr)
{
	return (EOPNOTSUPP);
}
5075 
5076 /* ARGSUSED */
5077 int
5078 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5079     socklen_t len, cred_t *cr)
5080 {
5081 	conn_t  *connp = (conn_t *)proto_handle;
5082 	int	error;
5083 
5084 	/* All Solaris components should pass a cred for this operation. */
5085 	ASSERT(cr != NULL);
5086 
5087 	/* Binding to a NULL address really means unbind */
5088 	if (sa == NULL)
5089 		error = rawip_do_unbind(connp);
5090 	else
5091 		error = rawip_do_bind(connp, sa, len);
5092 
5093 	if (error < 0) {
5094 		if (error == -TOUTSTATE)
5095 			error = EINVAL;
5096 		else
5097 			error = proto_tlitosyserr(-error);
5098 	}
5099 	return (error);
5100 }
5101 
5102 static int
5103 rawip_implicit_bind(conn_t *connp)
5104 {
5105 	sin6_t sin6addr;
5106 	sin_t *sin;
5107 	sin6_t *sin6;
5108 	socklen_t len;
5109 	int error;
5110 
5111 	if (connp->conn_family == AF_INET) {
5112 		len = sizeof (struct sockaddr_in);
5113 		sin = (sin_t *)&sin6addr;
5114 		*sin = sin_null;
5115 		sin->sin_family = AF_INET;
5116 		sin->sin_addr.s_addr = INADDR_ANY;
5117 	} else {
5118 		ASSERT(connp->conn_family == AF_INET6);
5119 		len = sizeof (sin6_t);
5120 		sin6 = (sin6_t *)&sin6addr;
5121 		*sin6 = sin6_null;
5122 		sin6->sin6_family = AF_INET6;
5123 		V6_SET_ZERO(sin6->sin6_addr);
5124 	}
5125 
5126 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5127 
5128 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5129 }
5130 
5131 static int
5132 rawip_unbind(conn_t *connp)
5133 {
5134 	int error;
5135 
5136 	error = rawip_do_unbind(connp);
5137 	if (error < 0) {
5138 		error = proto_tlitosyserr(-error);
5139 	}
5140 	return (error);
5141 }
5142 
5143 /* ARGSUSED */
5144 int
5145 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5146 {
5147 	return (EOPNOTSUPP);
5148 }
5149 
5150 int
5151 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5152     socklen_t len, sock_connid_t *id, cred_t *cr)
5153 {
5154 	conn_t	*connp = (conn_t *)proto_handle;
5155 	icmp_t *icmp = connp->conn_icmp;
5156 	int	error;
5157 	boolean_t did_bind = B_FALSE;
5158 	pid_t	pid = curproc->p_pid;
5159 
5160 	/* All Solaris components should pass a cred for this operation. */
5161 	ASSERT(cr != NULL);
5162 
5163 	if (sa == NULL) {
5164 		/*
5165 		 * Disconnect
5166 		 * Make sure we are connected
5167 		 */
5168 		if (icmp->icmp_state != TS_DATA_XFER)
5169 			return (EINVAL);
5170 
5171 		error = icmp_disconnect(connp);
5172 		return (error);
5173 	}
5174 
5175 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
5176 	if (error != 0)
5177 		return (error);
5178 
5179 	/* do an implicit bind if necessary */
5180 	if (icmp->icmp_state == TS_UNBND) {
5181 		error = rawip_implicit_bind(connp);
5182 		/*
5183 		 * We could be racing with an actual bind, in which case
5184 		 * we would see EPROTO. We cross our fingers and try
5185 		 * to connect.
5186 		 */
5187 		if (!(error == 0 || error == EPROTO))
5188 			return (error);
5189 		did_bind = B_TRUE;
5190 	}
5191 
5192 	/*
5193 	 * set SO_DGRAM_ERRIND
5194 	 */
5195 	connp->conn_dgram_errind = B_TRUE;
5196 
5197 	error = rawip_do_connect(connp, sa, len, cr, pid);
5198 	if (error != 0 && did_bind) {
5199 		int unbind_err;
5200 
5201 		unbind_err = rawip_unbind(connp);
5202 		ASSERT(unbind_err == 0);
5203 	}
5204 
5205 	if (error == 0) {
5206 		*id = 0;
5207 		(*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5208 		    0, NULL, -1);
5209 	} else if (error < 0) {
5210 		error = proto_tlitosyserr(-error);
5211 	}
5212 	return (error);
5213 }
5214 
5215 /* ARGSUSED2 */
5216 int
5217 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
5218     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
5219 {
5220 	conn_t  *connp = (conn_t *)proto_handle;
5221 	icmp_t	*icmp;
5222 	struct T_capability_ack tca;
5223 	struct sockaddr_in6 laddr, faddr;
5224 	socklen_t laddrlen, faddrlen;
5225 	short opts;
5226 	struct stroptions *stropt;
5227 	mblk_t *stropt_mp;
5228 	int error;
5229 
5230 	icmp = connp->conn_icmp;
5231 
5232 	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
5233 
5234 	/*
5235 	 * setup the fallback stream that was allocated
5236 	 */
5237 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
5238 	connp->conn_minor_arena = WR(q)->q_ptr;
5239 
5240 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
5241 
5242 	WR(q)->q_qinfo = &icmpwinit;
5243 
5244 	connp->conn_rq = RD(q);
5245 	connp->conn_wq = WR(q);
5246 
5247 	/* Notify stream head about options before sending up data */
5248 	stropt_mp->b_datap->db_type = M_SETOPTS;
5249 	stropt_mp->b_wptr += sizeof (*stropt);
5250 	stropt = (struct stroptions *)stropt_mp->b_rptr;
5251 	stropt->so_flags = SO_WROFF | SO_HIWAT;
5252 	stropt->so_wroff = connp->conn_wroff;
5253 	stropt->so_hiwat = connp->conn_rcvbuf;
5254 	putnext(RD(q), stropt_mp);
5255 
5256 	/*
5257 	 * free helper stream
5258 	 */
5259 	ip_free_helper_stream(connp);
5260 
5261 	/*
5262 	 * Collect the information needed to sync with the sonode
5263 	 */
5264 	icmp_do_capability_ack(icmp, &tca, TC1_INFO);
5265 
5266 	laddrlen = faddrlen = sizeof (sin6_t);
5267 	(void) rawip_getsockname((sock_lower_handle_t)connp,
5268 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
5269 	error = rawip_getpeername((sock_lower_handle_t)connp,
5270 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
5271 	if (error != 0)
5272 		faddrlen = 0;
5273 	opts = 0;
5274 	if (connp->conn_dgram_errind)
5275 		opts |= SO_DGRAM_ERRIND;
5276 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
5277 		opts |= SO_DONTROUTE;
5278 
5279 	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
5280 	    (struct sockaddr *)&laddr, laddrlen,
5281 	    (struct sockaddr *)&faddr, faddrlen, opts);
5282 
5283 	/*
5284 	 * Attempts to send data up during fallback will result in it being
5285 	 * queued in icmp_t. Now we push up any queued packets.
5286 	 */
5287 	mutex_enter(&icmp->icmp_recv_lock);
5288 	while (icmp->icmp_fallback_queue_head != NULL) {
5289 		mblk_t	*mp;
5290 
5291 		mp = icmp->icmp_fallback_queue_head;
5292 		icmp->icmp_fallback_queue_head = mp->b_next;
5293 		mp->b_next = NULL;
5294 		mutex_exit(&icmp->icmp_recv_lock);
5295 		putnext(RD(q), mp);
5296 		mutex_enter(&icmp->icmp_recv_lock);
5297 	}
5298 	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
5299 
5300 	/*
5301 	 * No longer a streams less socket
5302 	 */
5303 	mutex_enter(&connp->conn_lock);
5304 	connp->conn_flags &= ~IPCL_NONSTR;
5305 	mutex_exit(&connp->conn_lock);
5306 
5307 	mutex_exit(&icmp->icmp_recv_lock);
5308 
5309 	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
5310 	    icmp->icmp_fallback_queue_tail == NULL);
5311 
5312 	ASSERT(connp->conn_ref >= 1);
5313 
5314 	return (0);
5315 }
5316 
5317 /* ARGSUSED2 */
5318 sock_lower_handle_t
5319 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5320     uint_t *smodep, int *errorp, int flags, cred_t *credp)
5321 {
5322 	conn_t *connp;
5323 
5324 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5325 		*errorp = EPROTONOSUPPORT;
5326 		return (NULL);
5327 	}
5328 
5329 	connp = rawip_do_open(family, credp, errorp, flags);
5330 	if (connp != NULL) {
5331 		connp->conn_flags |= IPCL_NONSTR;
5332 
5333 		mutex_enter(&connp->conn_lock);
5334 		connp->conn_state_flags &= ~CONN_INCIPIENT;
5335 		mutex_exit(&connp->conn_lock);
5336 		*sock_downcalls = &sock_rawip_downcalls;
5337 		*smodep = SM_ATOMIC;
5338 	} else {
5339 		ASSERT(*errorp != 0);
5340 	}
5341 
5342 	return ((sock_lower_handle_t)connp);
5343 }
5344 
5345 /* ARGSUSED3 */
5346 void
5347 rawip_activate(sock_lower_handle_t proto_handle,
5348     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5349     cred_t *cr)
5350 {
5351 	conn_t 			*connp = (conn_t *)proto_handle;
5352 	struct sock_proto_props sopp;
5353 
5354 	/* All Solaris components should pass a cred for this operation. */
5355 	ASSERT(cr != NULL);
5356 
5357 	connp->conn_upcalls = sock_upcalls;
5358 	connp->conn_upper_handle = sock_handle;
5359 
5360 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5361 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5362 	sopp.sopp_wroff = connp->conn_wroff;
5363 	sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5364 	sopp.sopp_rxlowat = connp->conn_rcvlowat;
5365 	sopp.sopp_maxblk = INFPSZ;
5366 	sopp.sopp_maxpsz = IP_MAXPACKET;
5367 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5368 	    icmp_mod_info.mi_minpsz;
5369 
5370 	(*connp->conn_upcalls->su_set_proto_props)
5371 	    (connp->conn_upper_handle, &sopp);
5372 
5373 	icmp_bind_proto(connp->conn_icmp);
5374 }
5375 
5376 /* ARGSUSED3 */
5377 int
5378 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5379     socklen_t *salenp, cred_t *cr)
5380 {
5381 	conn_t  *connp = (conn_t *)proto_handle;
5382 	icmp_t  *icmp = connp->conn_icmp;
5383 	int	error;
5384 
5385 	/* All Solaris components should pass a cred for this operation. */
5386 	ASSERT(cr != NULL);
5387 
5388 	mutex_enter(&connp->conn_lock);
5389 	if (icmp->icmp_state != TS_DATA_XFER)
5390 		error = ENOTCONN;
5391 	else
5392 		error = conn_getpeername(connp, sa, salenp);
5393 	mutex_exit(&connp->conn_lock);
5394 	return (error);
5395 }
5396 
5397 /* ARGSUSED3 */
5398 int
5399 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5400     socklen_t *salenp, cred_t *cr)
5401 {
5402 	conn_t  *connp = (conn_t *)proto_handle;
5403 	int	error;
5404 
5405 	/* All Solaris components should pass a cred for this operation. */
5406 	ASSERT(cr != NULL);
5407 
5408 	mutex_enter(&connp->conn_lock);
5409 	error = conn_getsockname(connp, sa, salenp);
5410 	mutex_exit(&connp->conn_lock);
5411 	return (error);
5412 }
5413 
5414 int
5415 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5416     const void *optvalp, socklen_t optlen, cred_t *cr)
5417 {
5418 	conn_t	*connp = (conn_t *)proto_handle;
5419 	int error;
5420 
5421 	/* All Solaris components should pass a cred for this operation. */
5422 	ASSERT(cr != NULL);
5423 
5424 	error = proto_opt_check(level, option_name, optlen, NULL,
5425 	    icmp_opt_obj.odb_opt_des_arr,
5426 	    icmp_opt_obj.odb_opt_arr_cnt,
5427 	    B_TRUE, B_FALSE, cr);
5428 
5429 	if (error != 0) {
5430 		/*
5431 		 * option not recognized
5432 		 */
5433 		if (error < 0) {
5434 			error = proto_tlitosyserr(-error);
5435 		}
5436 		return (error);
5437 	}
5438 
5439 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5440 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5441 	    (uchar_t *)optvalp, NULL, cr);
5442 
5443 	ASSERT(error >= 0);
5444 
5445 	return (error);
5446 }
5447 
5448 int
5449 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5450     void *optvalp, socklen_t *optlen, cred_t *cr)
5451 {
5452 	int		error;
5453 	conn_t		*connp = (conn_t *)proto_handle;
5454 	t_uscalar_t	max_optbuf_len;
5455 	void		*optvalp_buf;
5456 	int		len;
5457 
5458 	/* All Solaris components should pass a cred for this operation. */
5459 	ASSERT(cr != NULL);
5460 
5461 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5462 	    icmp_opt_obj.odb_opt_des_arr,
5463 	    icmp_opt_obj.odb_opt_arr_cnt,
5464 	    B_FALSE, B_TRUE, cr);
5465 
5466 	if (error != 0) {
5467 		if (error < 0) {
5468 			error = proto_tlitosyserr(-error);
5469 		}
5470 		return (error);
5471 	}
5472 
5473 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5474 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5475 	if (len == -1) {
5476 		kmem_free(optvalp_buf, max_optbuf_len);
5477 		return (EINVAL);
5478 	}
5479 
5480 	/*
5481 	 * update optlen and copy option value
5482 	 */
5483 	t_uscalar_t size = MIN(len, *optlen);
5484 
5485 	bcopy(optvalp_buf, optvalp, size);
5486 	bcopy(&size, optlen, sizeof (size));
5487 
5488 	kmem_free(optvalp_buf, max_optbuf_len);
5489 	return (0);
5490 }
5491 
5492 /* ARGSUSED1 */
5493 int
5494 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5495 {
5496 	conn_t	*connp = (conn_t *)proto_handle;
5497 
5498 	/* All Solaris components should pass a cred for this operation. */
5499 	ASSERT(cr != NULL);
5500 
5501 	(void) rawip_do_close(connp);
5502 	return (0);
5503 }
5504 
5505 /* ARGSUSED2 */
5506 int
5507 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5508 {
5509 	conn_t  *connp = (conn_t *)proto_handle;
5510 
5511 	/* All Solaris components should pass a cred for this operation. */
5512 	ASSERT(cr != NULL);
5513 
5514 	/* shut down the send side */
5515 	if (how != SHUT_RD)
5516 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5517 		    SOCK_OPCTL_SHUT_SEND, 0);
5518 	/* shut down the recv side */
5519 	if (how != SHUT_WR)
5520 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5521 		    SOCK_OPCTL_SHUT_RECV, 0);
5522 	return (0);
5523 }
5524 
5525 void
5526 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5527 {
5528 	conn_t  *connp = (conn_t *)proto_handle;
5529 	icmp_t	*icmp = connp->conn_icmp;
5530 
5531 	mutex_enter(&icmp->icmp_recv_lock);
5532 	connp->conn_flow_cntrld = B_FALSE;
5533 	mutex_exit(&icmp->icmp_recv_lock);
5534 }
5535 
5536 int
5537 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5538     int mode, int32_t *rvalp, cred_t *cr)
5539 {
5540 	conn_t  	*connp = (conn_t *)proto_handle;
5541 	int		error;
5542 
5543 	/* All Solaris components should pass a cred for this operation. */
5544 	ASSERT(cr != NULL);
5545 
5546 	/*
5547 	 * If we don't have a helper stream then create one.
5548 	 * ip_create_helper_stream takes care of locking the conn_t,
5549 	 * so this check for NULL is just a performance optimization.
5550 	 */
5551 	if (connp->conn_helper_info == NULL) {
5552 		icmp_stack_t *is = connp->conn_icmp->icmp_is;
5553 
5554 		ASSERT(is->is_ldi_ident != NULL);
5555 
5556 		/*
5557 		 * Create a helper stream for non-STREAMS socket.
5558 		 */
5559 		error = ip_create_helper_stream(connp, is->is_ldi_ident);
5560 		if (error != 0) {
5561 			ip0dbg(("rawip_ioctl: create of IP helper stream "
5562 			    "failed %d\n", error));
5563 			return (error);
5564 		}
5565 	}
5566 
5567 	switch (cmd) {
5568 	case _SIOCSOCKFALLBACK:
5569 	case TI_GETPEERNAME:
5570 	case TI_GETMYNAME:
5571 #ifdef DEBUG
5572 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5573 		    " socket", cmd);
5574 #endif
5575 		error = EINVAL;
5576 		break;
5577 	default:
5578 		/*
5579 		 * Pass on to IP using helper stream
5580 		 */
5581 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5582 		    cmd, arg, mode, cr, rvalp);
5583 		break;
5584 	}
5585 	return (error);
5586 }
5587 
5588 int
5589 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5590     cred_t *cr)
5591 {
5592 	sin6_t		*sin6;
5593 	sin_t		*sin = NULL;
5594 	uint_t		srcid;
5595 	conn_t		*connp = (conn_t *)proto_handle;
5596 	icmp_t		*icmp = connp->conn_icmp;
5597 	int		error = 0;
5598 	icmp_stack_t	*is = icmp->icmp_is;
5599 	pid_t		pid = curproc->p_pid;
5600 	ip_xmit_attr_t	*ixa;
5601 
5602 	ASSERT(DB_TYPE(mp) == M_DATA);
5603 
5604 	/* All Solaris components should pass a cred for this operation. */
5605 	ASSERT(cr != NULL);
5606 
5607 	/* do an implicit bind if necessary */
5608 	if (icmp->icmp_state == TS_UNBND) {
5609 		error = rawip_implicit_bind(connp);
5610 		/*
5611 		 * We could be racing with an actual bind, in which case
5612 		 * we would see EPROTO. We cross our fingers and try
5613 		 * to connect.
5614 		 */
5615 		if (!(error == 0 || error == EPROTO)) {
5616 			freemsg(mp);
5617 			return (error);
5618 		}
5619 	}
5620 
5621 	/* Protocol 255 contains full IP headers */
5622 	/* Read without holding lock */
5623 	if (icmp->icmp_hdrincl) {
5624 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
5625 		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5626 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5627 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5628 				freemsg(mp);
5629 				return (EINVAL);
5630 			}
5631 		}
5632 		error = icmp_output_hdrincl(connp, mp, cr, pid);
5633 		if (is->is_sendto_ignerr)
5634 			return (0);
5635 		else
5636 			return (error);
5637 	}
5638 
5639 	/* Connected? */
5640 	if (msg->msg_name == NULL) {
5641 		if (icmp->icmp_state != TS_DATA_XFER) {
5642 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5643 			return (EDESTADDRREQ);
5644 		}
5645 		if (msg->msg_controllen != 0) {
5646 			error = icmp_output_ancillary(connp, NULL, NULL, mp,
5647 			    NULL, msg, cr, pid);
5648 		} else {
5649 			error = icmp_output_connected(connp, mp, cr, pid);
5650 		}
5651 		if (is->is_sendto_ignerr)
5652 			return (0);
5653 		else
5654 			return (error);
5655 	}
5656 	if (icmp->icmp_state == TS_DATA_XFER) {
5657 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5658 		return (EISCONN);
5659 	}
5660 	error = proto_verify_ip_addr(connp->conn_family,
5661 	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5662 	if (error != 0) {
5663 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5664 		return (error);
5665 	}
5666 	switch (connp->conn_family) {
5667 	case AF_INET6:
5668 		sin6 = (sin6_t *)msg->msg_name;
5669 
5670 		/* No support for mapped addresses on raw sockets */
5671 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5672 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5673 			return (EADDRNOTAVAIL);
5674 		}
5675 		srcid = sin6->__sin6_src_id;
5676 
5677 		/*
5678 		 * If the local address is a mapped address return
5679 		 * an error.
5680 		 * It would be possible to send an IPv6 packet but the
5681 		 * response would never make it back to the application
5682 		 * since it is bound to a mapped address.
5683 		 */
5684 		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5685 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5686 			return (EADDRNOTAVAIL);
5687 		}
5688 
5689 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5690 			sin6->sin6_addr = ipv6_loopback;
5691 
5692 		/*
5693 		 * We have to allocate an ip_xmit_attr_t before we grab
5694 		 * conn_lock and we need to hold conn_lock once we've check
5695 		 * conn_same_as_last_v6 to handle concurrent send* calls on a
5696 		 * socket.
5697 		 */
5698 		if (msg->msg_controllen == 0) {
5699 			ixa = conn_get_ixa(connp, B_FALSE);
5700 			if (ixa == NULL) {
5701 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5702 				return (ENOMEM);
5703 			}
5704 		} else {
5705 			ixa = NULL;
5706 		}
5707 		mutex_enter(&connp->conn_lock);
5708 		if (icmp->icmp_delayed_error != 0) {
5709 			sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5710 
5711 			error = icmp->icmp_delayed_error;
5712 			icmp->icmp_delayed_error = 0;
5713 
5714 			/* Compare IP address and family */
5715 
5716 			if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5717 			    &sin2->sin6_addr) &&
5718 			    sin6->sin6_family == sin2->sin6_family) {
5719 				mutex_exit(&connp->conn_lock);
5720 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5721 				if (ixa != NULL)
5722 					ixa_refrele(ixa);
5723 				return (error);
5724 			}
5725 		}
5726 		if (msg->msg_controllen != 0) {
5727 			mutex_exit(&connp->conn_lock);
5728 			ASSERT(ixa == NULL);
5729 			error = icmp_output_ancillary(connp, NULL, sin6, mp,
5730 			    NULL, msg, cr, pid);
5731 		} else if (conn_same_as_last_v6(connp, sin6) &&
5732 		    connp->conn_lastsrcid == srcid &&
5733 		    ipsec_outbound_policy_current(ixa)) {
5734 			/* icmp_output_lastdst drops conn_lock */
5735 			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5736 		} else {
5737 			/* icmp_output_newdst drops conn_lock */
5738 			error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5739 			    pid, ixa);
5740 		}
5741 		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5742 		if (is->is_sendto_ignerr)
5743 			return (0);
5744 		else
5745 			return (error);
5746 	case AF_INET:
5747 		sin = (sin_t *)msg->msg_name;
5748 
5749 		if (sin->sin_addr.s_addr == INADDR_ANY)
5750 			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5751 
5752 		/*
5753 		 * We have to allocate an ip_xmit_attr_t before we grab
5754 		 * conn_lock and we need to hold conn_lock once we've check
5755 		 * conn_same_as_last_v6 to handle concurrent send* on a socket.
5756 		 */
5757 		if (msg->msg_controllen == 0) {
5758 			ixa = conn_get_ixa(connp, B_FALSE);
5759 			if (ixa == NULL) {
5760 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5761 				return (ENOMEM);
5762 			}
5763 		} else {
5764 			ixa = NULL;
5765 		}
5766 		mutex_enter(&connp->conn_lock);
5767 		if (icmp->icmp_delayed_error != 0) {
5768 			sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5769 
5770 			error = icmp->icmp_delayed_error;
5771 			icmp->icmp_delayed_error = 0;
5772 
5773 			/* Compare IP address */
5774 
5775 			if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5776 				mutex_exit(&connp->conn_lock);
5777 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5778 				if (ixa != NULL)
5779 					ixa_refrele(ixa);
5780 				return (error);
5781 			}
5782 		}
5783 
5784 		if (msg->msg_controllen != 0) {
5785 			mutex_exit(&connp->conn_lock);
5786 			ASSERT(ixa == NULL);
5787 			error = icmp_output_ancillary(connp, sin, NULL, mp,
5788 			    NULL, msg, cr, pid);
5789 		} else if (conn_same_as_last_v4(connp, sin) &&
5790 		    ipsec_outbound_policy_current(ixa)) {
5791 			/* icmp_output_lastdst drops conn_lock */
5792 			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5793 		} else {
5794 			/* icmp_output_newdst drops conn_lock */
5795 			error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5796 			    pid, ixa);
5797 		}
5798 		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5799 		if (is->is_sendto_ignerr)
5800 			return (0);
5801 		else
5802 			return (error);
5803 	default:
5804 		return (EINVAL);
5805 	}
5806 }
5807 
5808 sock_downcalls_t sock_rawip_downcalls = {
5809 	rawip_activate,
5810 	rawip_accept,
5811 	rawip_bind,
5812 	rawip_listen,
5813 	rawip_connect,
5814 	rawip_getpeername,
5815 	rawip_getsockname,
5816 	rawip_getsockopt,
5817 	rawip_setsockopt,
5818 	rawip_send,
5819 	NULL,
5820 	NULL,
5821 	NULL,
5822 	rawip_shutdown,
5823 	rawip_clr_flowctrl,
5824 	rawip_ioctl,
5825 	rawip_close
5826 };
5827