xref: /titanic_52/usr/src/uts/common/inet/ip/icmp.c (revision cb8a054b1ab30d5caa746e6c44f29d4c9d3071c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /* Copyright (c) 1990 Mentat Inc. */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strlog.h>
30 #include <sys/strsun.h>
31 #define	_SUN_TPI_VERSION 2
32 #include <sys/tihdr.h>
33 #include <sys/timod.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/strsubr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/cmn_err.h>
40 #include <sys/kmem.h>
41 #include <sys/cred.h>
42 #include <sys/policy.h>
43 #include <sys/priv.h>
44 #include <sys/ucred.h>
45 #include <sys/zone.h>
46 
47 #include <sys/sockio.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/vtrace.h>
51 #include <sys/sdt.h>
52 #include <sys/debug.h>
53 #include <sys/isa_defs.h>
54 #include <sys/random.h>
55 #include <netinet/in.h>
56 #include <netinet/ip6.h>
57 #include <netinet/icmp6.h>
58 #include <netinet/udp.h>
59 
60 #include <inet/common.h>
61 #include <inet/ip.h>
62 #include <inet/ip_impl.h>
63 #include <inet/ipsec_impl.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_ire.h>
66 #include <inet/ip_if.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip_ndp.h>
69 #include <inet/proto_set.h>
70 #include <inet/mib2.h>
71 #include <inet/nd.h>
72 #include <inet/optcom.h>
73 #include <inet/snmpcom.h>
74 #include <inet/kstatcom.h>
75 #include <inet/ipclassifier.h>
76 
77 #include <sys/tsol/label.h>
78 #include <sys/tsol/tnet.h>
79 
80 #include <inet/rawip_impl.h>
81 
82 #include <sys/disp.h>
83 
84 /*
85  * Synchronization notes:
86  *
87  * RAWIP is MT and uses the usual kernel synchronization primitives. We use
88  * conn_lock to protect the icmp_t.
89  *
90  * Plumbing notes:
91  * ICMP is always a device driver. For compatibility with mibopen() code
92  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
93  * dummy module.
94  */
95 
96 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
97 static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
98 static void	icmp_bind_proto(icmp_t *icmp);
99 static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
100     const in6_addr_t *, uint32_t);
101 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
102 static int	icmp_close(queue_t *q, int flags);
103 static void	icmp_close_free(conn_t *);
104 static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
105 static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
106 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
107     int sys_error);
108 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
109     t_scalar_t tlierr, int sys_error);
110 static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
111     ip_recv_attr_t *);
112 static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
113     ip_recv_attr_t *);
114 static void	icmp_info_req(queue_t *q, mblk_t *mp);
115 static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
116 static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
117 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
118 		    cred_t *credp);
119 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
120 		    cred_t *credp);
121 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
122 int		icmp_opt_set(conn_t *connp, uint_t optset_context,
123 		    int level, int name, uint_t inlen,
124 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
125 		    void *thisdg_attrs, cred_t *cr);
126 int		icmp_opt_get(conn_t *connp, int level, int name,
127 		    uchar_t *ptr);
128 static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
129 		    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
130 static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
131     const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
132 static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
133     mblk_t *, const in6_addr_t *, uint32_t, int *);
134 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
135 		    uchar_t *ptr, int len);
136 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
137 static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
138 static void	icmp_wput(queue_t *q, mblk_t *mp);
139 static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
140 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
141 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
142 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
143 static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);
144 
145 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
146 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
147 
148 static void	*rawip_kstat_init(netstackid_t stackid);
149 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
150 static int	rawip_kstat_update(kstat_t *kp, int rw);
151 static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);
152 
153 /* Common routines for TPI and socket module */
154 static conn_t	*rawip_do_open(int, cred_t *, int *, int);
155 static void	rawip_do_close(conn_t *);
156 static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
157 static int	rawip_do_unbind(conn_t *);
158 static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
159     cred_t *, pid_t);
160 
161 int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
162 		    socklen_t *, cred_t *);
163 int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
164 		    socklen_t *, cred_t *);
165 
166 static struct module_info icmp_mod_info =  {
167 	5707, "icmp", 1, INFPSZ, 512, 128
168 };
169 
170 /*
171  * Entry points for ICMP as a device.
172  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
173  */
174 static struct qinit icmprinitv4 = {
175 	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
176 };
177 
178 static struct qinit icmprinitv6 = {
179 	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
180 };
181 
182 static struct qinit icmpwinit = {
183 	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
184 };
185 
186 /* ICMP entry point during fallback */
187 static struct qinit icmp_fallback_sock_winit = {
188 	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
189 };
190 
191 /* For AF_INET aka /dev/icmp */
192 struct streamtab icmpinfov4 = {
193 	&icmprinitv4, &icmpwinit
194 };
195 
196 /* For AF_INET6 aka /dev/icmp6 */
197 struct streamtab icmpinfov6 = {
198 	&icmprinitv6, &icmpwinit
199 };
200 
201 /* Default structure copied into T_INFO_ACK messages */
202 static struct T_info_ack icmp_g_t_info_ack = {
203 	T_INFO_ACK,
204 	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
205 	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
206 	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
207 	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
208 	0,		/* ADDR_size - filled in later. */
209 	0,		/* OPT_size - not initialized here */
210 	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
211 	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
212 	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
213 	(XPG4_1|SENDZERO) /* PROVIDER_flag */
214 };
215 
216 /*
217  * All of these are alterable, within the min/max values given, at run time.
218  *
219  * Note: All those tunables which do not start with "icmp_" are Committed and
220  * therefore are public. See PSARC 2009/306.
221  */
222 static mod_prop_info_t icmp_propinfo_tbl[] = {
223 	/* tunable - 0 */
224 	{ "icmp_wroff_extra", MOD_PROTO_RAWIP,
225 	    mod_set_uint32, mod_get_uint32,
226 	    {0, 128, 32}, {32} },
227 
228 	{ "icmp_ipv4_ttl", MOD_PROTO_RAWIP,
229 	    mod_set_uint32, mod_get_uint32,
230 	    {1, 255, 255}, {255} },
231 
232 	{ "icmp_ipv6_hoplimit", MOD_PROTO_RAWIP,
233 	    mod_set_uint32, mod_get_uint32,
234 	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
235 	    {IPV6_DEFAULT_HOPS} },
236 
237 	{ "icmp_bsd_compat", MOD_PROTO_RAWIP,
238 	    mod_set_boolean, mod_get_boolean,
239 	    {B_TRUE}, {B_TRUE} },
240 
241 	{ "send_maxbuf", MOD_PROTO_RAWIP,
242 	    mod_set_uint32, mod_get_uint32,
243 	    {4096, 65536, 8192}, {8192} },
244 
245 	{ "icmp_xmit_lowat", MOD_PROTO_RAWIP,
246 	    mod_set_uint32, mod_get_uint32,
247 	    {0, 65536, 1024}, {1024} },
248 
249 	{ "recv_maxbuf", MOD_PROTO_RAWIP,
250 	    mod_set_uint32, mod_get_uint32,
251 	    {4096, 65536, 8192}, {8192} },
252 
253 	{ "icmp_max_buf", MOD_PROTO_RAWIP,
254 	    mod_set_uint32, mod_get_uint32,
255 	    {65536, 1024*1024*1024, 256*1024}, {256 * 1024} },
256 
257 	{ "icmp_pmtu_discovery", MOD_PROTO_RAWIP,
258 	    mod_set_boolean, mod_get_boolean,
259 	    {B_FALSE}, {B_FALSE} },
260 
261 	{ "icmp_sendto_ignerr", MOD_PROTO_RAWIP,
262 	    mod_set_boolean, mod_get_boolean,
263 	    {B_FALSE}, {B_FALSE} },
264 
265 	{ "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },
266 
267 	{ NULL, 0, NULL, NULL, {0}, {0} }
268 };
269 
270 #define	is_wroff_extra			is_propinfo_tbl[0].prop_cur_uval
271 #define	is_ipv4_ttl			is_propinfo_tbl[1].prop_cur_uval
272 #define	is_ipv6_hoplimit		is_propinfo_tbl[2].prop_cur_uval
273 #define	is_bsd_compat			is_propinfo_tbl[3].prop_cur_bval
274 #define	is_xmit_hiwat			is_propinfo_tbl[4].prop_cur_uval
275 #define	is_xmit_lowat			is_propinfo_tbl[5].prop_cur_uval
276 #define	is_recv_hiwat			is_propinfo_tbl[6].prop_cur_uval
277 #define	is_max_buf			is_propinfo_tbl[7].prop_cur_uval
278 #define	is_pmtu_discovery		is_propinfo_tbl[8].prop_cur_bval
279 #define	is_sendto_ignerr		is_propinfo_tbl[9].prop_cur_bval
280 
281 typedef union T_primitives *t_primp_t;
282 
283 /*
284  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
285  * passed to icmp_wput.
286  * It calls IP to verify the local IP address, and calls IP to insert
287  * the conn_t in the fanout table.
288  * If everything is ok it then sends the T_BIND_ACK back up.
289  */
290 static void
291 icmp_tpi_bind(queue_t *q, mblk_t *mp)
292 {
293 	int	error;
294 	struct sockaddr *sa;
295 	struct T_bind_req *tbr;
296 	socklen_t	len;
297 	sin_t	*sin;
298 	sin6_t	*sin6;
299 	icmp_t		*icmp;
300 	conn_t	*connp = Q_TO_CONN(q);
301 	mblk_t *mp1;
302 	cred_t *cr;
303 
304 	/*
305 	 * All Solaris components should pass a db_credp
306 	 * for this TPI message, hence we ASSERT.
307 	 * But in case there is some other M_PROTO that looks
308 	 * like a TPI message sent by some other kernel
309 	 * component, we check and return an error.
310 	 */
311 	cr = msg_getcred(mp, NULL);
312 	ASSERT(cr != NULL);
313 	if (cr == NULL) {
314 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
315 		return;
316 	}
317 
318 	icmp = connp->conn_icmp;
319 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
320 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
321 		    "icmp_bind: bad req, len %u",
322 		    (uint_t)(mp->b_wptr - mp->b_rptr));
323 		icmp_err_ack(q, mp, TPROTO, 0);
324 		return;
325 	}
326 
327 	if (icmp->icmp_state != TS_UNBND) {
328 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
329 		    "icmp_bind: bad state, %u", icmp->icmp_state);
330 		icmp_err_ack(q, mp, TOUTSTATE, 0);
331 		return;
332 	}
333 
334 	/*
335 	 * Reallocate the message to make sure we have enough room for an
336 	 * address.
337 	 */
338 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
339 	if (mp1 == NULL) {
340 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
341 		return;
342 	}
343 	mp = mp1;
344 
345 	/* Reset the message type in preparation for shipping it back. */
346 	DB_TYPE(mp) = M_PCPROTO;
347 	tbr = (struct T_bind_req *)mp->b_rptr;
348 	len = tbr->ADDR_length;
349 	switch (len) {
350 	case 0:	/* request for a generic port */
351 		tbr->ADDR_offset = sizeof (struct T_bind_req);
352 		if (connp->conn_family == AF_INET) {
353 			tbr->ADDR_length = sizeof (sin_t);
354 			sin = (sin_t *)&tbr[1];
355 			*sin = sin_null;
356 			sin->sin_family = AF_INET;
357 			mp->b_wptr = (uchar_t *)&sin[1];
358 			sa = (struct sockaddr *)sin;
359 			len = sizeof (sin_t);
360 		} else {
361 			ASSERT(connp->conn_family == AF_INET6);
362 			tbr->ADDR_length = sizeof (sin6_t);
363 			sin6 = (sin6_t *)&tbr[1];
364 			*sin6 = sin6_null;
365 			sin6->sin6_family = AF_INET6;
366 			mp->b_wptr = (uchar_t *)&sin6[1];
367 			sa = (struct sockaddr *)sin6;
368 			len = sizeof (sin6_t);
369 		}
370 		break;
371 
372 	case sizeof (sin_t):	/* Complete IPv4 address */
373 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
374 		    sizeof (sin_t));
375 		break;
376 
377 	case sizeof (sin6_t):	/* Complete IPv6 address */
378 		sa = (struct sockaddr *)mi_offset_param(mp,
379 		    tbr->ADDR_offset, sizeof (sin6_t));
380 		break;
381 
382 	default:
383 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
384 		    "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
385 		icmp_err_ack(q, mp, TBADADDR, 0);
386 		return;
387 	}
388 
389 	error = rawip_do_bind(connp, sa, len);
390 	if (error != 0) {
391 		if (error > 0) {
392 			icmp_err_ack(q, mp, TSYSERR, error);
393 		} else {
394 			icmp_err_ack(q, mp, -error, 0);
395 		}
396 	} else {
397 		tbr->PRIM_type = T_BIND_ACK;
398 		qreply(q, mp);
399 	}
400 }
401 
402 static int
403 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
404 {
405 	sin_t		*sin;
406 	sin6_t		*sin6;
407 	icmp_t		*icmp = connp->conn_icmp;
408 	int		error = 0;
409 	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
410 	in_port_t	lport;		/* Network byte order */
411 	ipaddr_t	v4src;		/* Set if AF_INET */
412 	in6_addr_t	v6src;
413 	uint_t		scopeid = 0;
414 	zoneid_t	zoneid = IPCL_ZONEID(connp);
415 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
416 
417 	if (sa == NULL || !OK_32PTR((char *)sa)) {
418 		return (EINVAL);
419 	}
420 
421 	switch (len) {
422 	case sizeof (sin_t):    /* Complete IPv4 address */
423 		sin = (sin_t *)sa;
424 		if (sin->sin_family != AF_INET ||
425 		    connp->conn_family != AF_INET) {
426 			/* TSYSERR, EAFNOSUPPORT */
427 			return (EAFNOSUPPORT);
428 		}
429 		v4src = sin->sin_addr.s_addr;
430 		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
431 		if (v4src != INADDR_ANY) {
432 			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
433 			    B_TRUE);
434 		}
435 		lport = sin->sin_port;
436 		break;
437 	case sizeof (sin6_t): /* Complete IPv6 address */
438 		sin6 = (sin6_t *)sa;
439 		if (sin6->sin6_family != AF_INET6 ||
440 		    connp->conn_family != AF_INET6) {
441 			/* TSYSERR, EAFNOSUPPORT */
442 			return (EAFNOSUPPORT);
443 		}
444 		/* No support for mapped addresses on raw sockets */
445 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
446 			/* TSYSERR, EADDRNOTAVAIL */
447 			return (EADDRNOTAVAIL);
448 		}
449 		v6src = sin6->sin6_addr;
450 		if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
451 			if (IN6_IS_ADDR_LINKSCOPE(&v6src))
452 				scopeid = sin6->sin6_scope_id;
453 			laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
454 			    B_TRUE, scopeid);
455 		}
456 		lport = sin6->sin6_port;
457 		break;
458 
459 	default:
460 		/* TBADADDR */
461 		return (EADDRNOTAVAIL);
462 	}
463 
464 	/* Is the local address a valid unicast, multicast, or broadcast? */
465 	if (laddr_type == IPVL_BAD)
466 		return (EADDRNOTAVAIL);
467 
468 	/*
469 	 * The state must be TS_UNBND.
470 	 */
471 	mutex_enter(&connp->conn_lock);
472 	if (icmp->icmp_state != TS_UNBND) {
473 		mutex_exit(&connp->conn_lock);
474 		return (-TOUTSTATE);
475 	}
476 
477 	/*
478 	 * Copy the source address into our icmp structure.  This address
479 	 * may still be zero; if so, ip will fill in the correct address
480 	 * each time an outbound packet is passed to it.
481 	 * If we are binding to a broadcast or multicast address then
482 	 * we just set the conn_bound_addr since we don't want to use
483 	 * that as the source address when sending.
484 	 */
485 	connp->conn_bound_addr_v6 = v6src;
486 	connp->conn_laddr_v6 = v6src;
487 	if (scopeid != 0) {
488 		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
489 		connp->conn_ixa->ixa_scopeid = scopeid;
490 		connp->conn_incoming_ifindex = scopeid;
491 	} else {
492 		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
493 		connp->conn_incoming_ifindex = connp->conn_bound_if;
494 	}
495 
496 	switch (laddr_type) {
497 	case IPVL_UNICAST_UP:
498 	case IPVL_UNICAST_DOWN:
499 		connp->conn_saddr_v6 = v6src;
500 		connp->conn_mcbc_bind = B_FALSE;
501 		break;
502 	case IPVL_MCAST:
503 	case IPVL_BCAST:
504 		/* ip_set_destination will pick a source address later */
505 		connp->conn_saddr_v6 = ipv6_all_zeros;
506 		connp->conn_mcbc_bind = B_TRUE;
507 		break;
508 	}
509 
510 	/* Any errors after this point should use late_error */
511 
512 	/*
513 	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
514 	 * with IPPROTO_TCP.
515 	 */
516 	connp->conn_lport = lport;
517 	connp->conn_fport = 0;
518 
519 	if (connp->conn_family == AF_INET) {
520 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
521 	} else {
522 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
523 	}
524 
525 	icmp->icmp_state = TS_IDLE;
526 
527 	/*
528 	 * We create an initial header template here to make a subsequent
529 	 * sendto have a starting point. Since conn_last_dst is zero the
530 	 * first sendto will always follow the 'dst changed' code path.
531 	 * Note that we defer massaging options and the related checksum
532 	 * adjustment until we have a destination address.
533 	 */
534 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
535 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
536 	if (error != 0) {
537 		mutex_exit(&connp->conn_lock);
538 		goto late_error;
539 	}
540 	/* Just in case */
541 	connp->conn_faddr_v6 = ipv6_all_zeros;
542 	connp->conn_v6lastdst = ipv6_all_zeros;
543 	mutex_exit(&connp->conn_lock);
544 
545 	error = ip_laddr_fanout_insert(connp);
546 	if (error != 0)
547 		goto late_error;
548 
549 	/* Bind succeeded */
550 	return (0);
551 
552 late_error:
553 	mutex_enter(&connp->conn_lock);
554 	connp->conn_saddr_v6 = ipv6_all_zeros;
555 	connp->conn_bound_addr_v6 = ipv6_all_zeros;
556 	connp->conn_laddr_v6 = ipv6_all_zeros;
557 	if (scopeid != 0) {
558 		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
559 		connp->conn_incoming_ifindex = connp->conn_bound_if;
560 	}
561 	icmp->icmp_state = TS_UNBND;
562 	connp->conn_v6lastdst = ipv6_all_zeros;
563 	connp->conn_lport = 0;
564 
565 	/* Restore the header that was built above - different source address */
566 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
567 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
568 	mutex_exit(&connp->conn_lock);
569 	return (error);
570 }
571 
572 /*
573  * Tell IP to just bind to the protocol.
574  */
575 static void
576 icmp_bind_proto(icmp_t *icmp)
577 {
578 	conn_t	*connp = icmp->icmp_connp;
579 
580 	mutex_enter(&connp->conn_lock);
581 	connp->conn_saddr_v6 = ipv6_all_zeros;
582 	connp->conn_laddr_v6 = ipv6_all_zeros;
583 	connp->conn_faddr_v6 = ipv6_all_zeros;
584 	connp->conn_v6lastdst = ipv6_all_zeros;
585 	mutex_exit(&connp->conn_lock);
586 
587 	(void) ip_laddr_fanout_insert(connp);
588 }
589 
590 /*
591  * This routine handles each T_CONN_REQ message passed to icmp.  It
592  * associates a default destination address with the stream.
593  *
594  * After various error checks are completed, icmp_connect() lays
595  * the target address and port into the composite header template.
596  * Then we ask IP for information, including a source address if we didn't
597  * already have one. Finally we send up the T_OK_ACK reply message.
598  */
599 static void
600 icmp_tpi_connect(queue_t *q, mblk_t *mp)
601 {
602 	conn_t	*connp = Q_TO_CONN(q);
603 	struct T_conn_req	*tcr;
604 	struct sockaddr *sa;
605 	socklen_t len;
606 	int error;
607 	cred_t *cr;
608 	pid_t pid;
609 	/*
610 	 * All Solaris components should pass a db_credp
611 	 * for this TPI message, hence we ASSERT.
612 	 * But in case there is some other M_PROTO that looks
613 	 * like a TPI message sent by some other kernel
614 	 * component, we check and return an error.
615 	 */
616 	cr = msg_getcred(mp, &pid);
617 	ASSERT(cr != NULL);
618 	if (cr == NULL) {
619 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
620 		return;
621 	}
622 
623 	tcr = (struct T_conn_req *)mp->b_rptr;
624 	/* Sanity checks */
625 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
626 		icmp_err_ack(q, mp, TPROTO, 0);
627 		return;
628 	}
629 
630 	if (tcr->OPT_length != 0) {
631 		icmp_err_ack(q, mp, TBADOPT, 0);
632 		return;
633 	}
634 
635 	len = tcr->DEST_length;
636 
637 	switch (len) {
638 	default:
639 		icmp_err_ack(q, mp, TBADADDR, 0);
640 		return;
641 	case sizeof (sin_t):
642 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
643 		    sizeof (sin_t));
644 		break;
645 	case sizeof (sin6_t):
646 		sa = (struct sockaddr *)mi_offset_param(mp,
647 		    tcr->DEST_offset, sizeof (sin6_t));
648 		break;
649 	}
650 
651 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
652 	if (error != 0) {
653 		icmp_err_ack(q, mp, TSYSERR, error);
654 		return;
655 	}
656 
657 	error = rawip_do_connect(connp, sa, len, cr, pid);
658 	if (error != 0) {
659 		if (error < 0) {
660 			icmp_err_ack(q, mp, -error, 0);
661 		} else {
662 			icmp_err_ack(q, mp, 0, error);
663 		}
664 	} else {
665 		mblk_t *mp1;
666 
667 		/*
668 		 * We have to send a connection confirmation to
669 		 * keep TLI happy.
670 		 */
671 		if (connp->conn_family == AF_INET) {
672 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
673 			    sizeof (sin_t), NULL, 0);
674 		} else {
675 			ASSERT(connp->conn_family == AF_INET6);
676 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
677 			    sizeof (sin6_t), NULL, 0);
678 		}
679 		if (mp1 == NULL) {
680 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
681 			return;
682 		}
683 
684 		/*
685 		 * Send ok_ack for T_CONN_REQ
686 		 */
687 		mp = mi_tpi_ok_ack_alloc(mp);
688 		if (mp == NULL) {
689 			/* Unable to reuse the T_CONN_REQ for the ack. */
690 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
691 			return;
692 		}
693 		putnext(connp->conn_rq, mp);
694 		putnext(connp->conn_rq, mp1);
695 	}
696 }
697 
698 static int
699 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
700     cred_t *cr, pid_t pid)
701 {
702 	icmp_t		*icmp;
703 	sin_t		*sin;
704 	sin6_t		*sin6;
705 	int		error;
706 	uint16_t 	dstport;
707 	ipaddr_t	v4dst;
708 	in6_addr_t	v6dst;
709 	uint32_t	flowinfo;
710 	ip_xmit_attr_t	*ixa;
711 	ip_xmit_attr_t	*oldixa;
712 	uint_t		scopeid = 0;
713 	uint_t		srcid = 0;
714 	in6_addr_t	v6src = connp->conn_saddr_v6;
715 
716 	icmp = connp->conn_icmp;
717 
718 	if (sa == NULL || !OK_32PTR((char *)sa)) {
719 		return (EINVAL);
720 	}
721 
722 	ASSERT(sa != NULL && len != 0);
723 
724 	/*
725 	 * Determine packet type based on type of address passed in
726 	 * the request should contain an IPv4 or IPv6 address.
727 	 * Make sure that address family matches the type of
728 	 * family of the address passed down.
729 	 */
730 	switch (len) {
731 	case sizeof (sin_t):
732 		sin = (sin_t *)sa;
733 
734 		v4dst = sin->sin_addr.s_addr;
735 		dstport = sin->sin_port;
736 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
737 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
738 		break;
739 
740 	case sizeof (sin6_t):
741 		sin6 = (sin6_t *)sa;
742 
743 		/* No support for mapped addresses on raw sockets */
744 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
745 			return (EADDRNOTAVAIL);
746 		}
747 		v6dst = sin6->sin6_addr;
748 		dstport = sin6->sin6_port;
749 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
750 		flowinfo = sin6->sin6_flowinfo;
751 		if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
752 			scopeid = sin6->sin6_scope_id;
753 		srcid = sin6->__sin6_src_id;
754 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
755 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
756 			    connp->conn_netstack);
757 		}
758 		break;
759 	}
760 
761 	/*
762 	 * If there is a different thread using conn_ixa then we get a new
763 	 * copy and cut the old one loose from conn_ixa. Otherwise we use
764 	 * conn_ixa and prevent any other thread from using/changing it.
765 	 * Once connect() is done other threads can use conn_ixa since the
766 	 * refcnt will be back at one.
767 	 * We defer updating conn_ixa until later to handle any concurrent
768 	 * conn_ixa_cleanup thread.
769 	 */
770 	ixa = conn_get_ixa(connp, B_FALSE);
771 	if (ixa == NULL)
772 		return (ENOMEM);
773 
774 	ASSERT(ixa->ixa_refcnt >= 2);
775 	ASSERT(ixa == connp->conn_ixa);
776 
777 	mutex_enter(&connp->conn_lock);
778 	/*
779 	 * This icmp_t must have bound already before doing a connect.
780 	 * Reject if a connect is in progress (we drop conn_lock during
781 	 * rawip_do_connect).
782 	 */
783 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
784 		mutex_exit(&connp->conn_lock);
785 		ixa_refrele(ixa);
786 		return (-TOUTSTATE);
787 	}
788 
789 	if (icmp->icmp_state == TS_DATA_XFER) {
790 		/* Already connected - clear out state */
791 		if (connp->conn_mcbc_bind)
792 			connp->conn_saddr_v6 = ipv6_all_zeros;
793 		else
794 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
795 		connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
796 		connp->conn_faddr_v6 = ipv6_all_zeros;
797 		icmp->icmp_state = TS_IDLE;
798 	}
799 
800 	/*
801 	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
802 	 * with IPPROTO_TCP.
803 	 */
804 	connp->conn_fport = dstport;
805 	if (connp->conn_ipversion == IPV4_VERSION) {
806 		/*
807 		 * Interpret a zero destination to mean loopback.
808 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
809 		 * generate the T_CONN_CON.
810 		 */
811 		if (v4dst == INADDR_ANY) {
812 			v4dst = htonl(INADDR_LOOPBACK);
813 			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
814 			ASSERT(connp->conn_family == AF_INET);
815 			sin->sin_addr.s_addr = v4dst;
816 		}
817 		connp->conn_faddr_v6 = v6dst;
818 		connp->conn_flowinfo = 0;
819 	} else {
820 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
821 		/*
822 		 * Interpret a zero destination to mean loopback.
823 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
824 		 * generate the T_CONN_CON.
825 		 */
826 		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
827 			v6dst = ipv6_loopback;
828 			sin6->sin6_addr = v6dst;
829 		}
830 		connp->conn_faddr_v6 = v6dst;
831 		connp->conn_flowinfo = flowinfo;
832 	}
833 
834 	/*
835 	 * We update our cred/cpid based on the caller of connect
836 	 */
837 	if (connp->conn_cred != cr) {
838 		crhold(cr);
839 		crfree(connp->conn_cred);
840 		connp->conn_cred = cr;
841 	}
842 	connp->conn_cpid = pid;
843 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
844 	ixa->ixa_cred = cr;
845 	ixa->ixa_cpid = pid;
846 	if (is_system_labeled()) {
847 		/* We need to restart with a label based on the cred */
848 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
849 	}
850 
851 	if (scopeid != 0) {
852 		ixa->ixa_flags |= IXAF_SCOPEID_SET;
853 		ixa->ixa_scopeid = scopeid;
854 		connp->conn_incoming_ifindex = scopeid;
855 	} else {
856 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
857 		connp->conn_incoming_ifindex = connp->conn_bound_if;
858 	}
859 
860 	/*
861 	 * conn_connect will drop conn_lock and reacquire it.
862 	 * To prevent a send* from messing with this icmp_t while the lock
863 	 * is dropped we set icmp_state and clear conn_v6lastdst.
864 	 * That will make all send* fail with EISCONN.
865 	 */
866 	connp->conn_v6lastdst = ipv6_all_zeros;
867 	icmp->icmp_state = TS_WCON_CREQ;
868 
869 	error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
870 	mutex_exit(&connp->conn_lock);
871 	if (error != 0)
872 		goto connect_failed;
873 
874 	/*
875 	 * The addresses have been verified. Time to insert in
876 	 * the correct fanout list.
877 	 */
878 	error = ipcl_conn_insert(connp);
879 	if (error != 0)
880 		goto connect_failed;
881 
882 	mutex_enter(&connp->conn_lock);
883 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
884 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
885 	if (error != 0) {
886 		mutex_exit(&connp->conn_lock);
887 		goto connect_failed;
888 	}
889 
890 	icmp->icmp_state = TS_DATA_XFER;
891 	/* Record this as the "last" send even though we haven't sent any */
892 	connp->conn_v6lastdst = connp->conn_faddr_v6;
893 	connp->conn_lastipversion = connp->conn_ipversion;
894 	connp->conn_lastdstport = connp->conn_fport;
895 	connp->conn_lastflowinfo = connp->conn_flowinfo;
896 	connp->conn_lastscopeid = scopeid;
897 	connp->conn_lastsrcid = srcid;
898 	/* Also remember a source to use together with lastdst */
899 	connp->conn_v6lastsrc = v6src;
900 
901 	oldixa = conn_replace_ixa(connp, ixa);
902 	mutex_exit(&connp->conn_lock);
903 	ixa_refrele(oldixa);
904 
905 	ixa_refrele(ixa);
906 	return (0);
907 
908 connect_failed:
909 	if (ixa != NULL)
910 		ixa_refrele(ixa);
911 	mutex_enter(&connp->conn_lock);
912 	icmp->icmp_state = TS_IDLE;
913 	/* In case the source address was set above */
914 	if (connp->conn_mcbc_bind)
915 		connp->conn_saddr_v6 = ipv6_all_zeros;
916 	else
917 		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
918 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
919 	connp->conn_faddr_v6 = ipv6_all_zeros;
920 	connp->conn_v6lastdst = ipv6_all_zeros;
921 	connp->conn_flowinfo = 0;
922 
923 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
924 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
925 	mutex_exit(&connp->conn_lock);
926 	return (error);
927 }
928 
929 static void
930 rawip_do_close(conn_t *connp)
931 {
932 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
933 
934 	ip_quiesce_conn(connp);
935 
936 	if (!IPCL_IS_NONSTR(connp)) {
937 		qprocsoff(connp->conn_rq);
938 	}
939 
940 	icmp_close_free(connp);
941 
942 	/*
943 	 * Now we are truly single threaded on this stream, and can
944 	 * delete the things hanging off the connp, and finally the connp.
945 	 * We removed this connp from the fanout list, it cannot be
946 	 * accessed thru the fanouts, and we already waited for the
947 	 * conn_ref to drop to 0. We are already in close, so
948 	 * there cannot be any other thread from the top. qprocsoff
949 	 * has completed, and service has completed or won't run in
950 	 * future.
951 	 */
952 	ASSERT(connp->conn_ref == 1);
953 
954 	if (!IPCL_IS_NONSTR(connp)) {
955 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
956 	} else {
957 		ip_free_helper_stream(connp);
958 	}
959 
960 	connp->conn_ref--;
961 	ipcl_conn_destroy(connp);
962 }
963 
964 static int
965 icmp_close(queue_t *q, int flags)
966 {
967 	conn_t  *connp;
968 
969 	if (flags & SO_FALLBACK) {
970 		/*
971 		 * stream is being closed while in fallback
972 		 * simply free the resources that were allocated
973 		 */
974 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
975 		qprocsoff(q);
976 		goto done;
977 	}
978 
979 	connp = Q_TO_CONN(q);
980 	(void) rawip_do_close(connp);
981 done:
982 	q->q_ptr = WR(q)->q_ptr = NULL;
983 	return (0);
984 }
985 
986 static void
987 icmp_close_free(conn_t *connp)
988 {
989 	icmp_t *icmp = connp->conn_icmp;
990 
991 	if (icmp->icmp_filter != NULL) {
992 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
993 		icmp->icmp_filter = NULL;
994 	}
995 
996 	/*
997 	 * Clear any fields which the kmem_cache constructor clears.
998 	 * Only icmp_connp needs to be preserved.
999 	 * TBD: We should make this more efficient to avoid clearing
1000 	 * everything.
1001 	 */
1002 	ASSERT(icmp->icmp_connp == connp);
1003 	bzero(icmp, sizeof (icmp_t));
1004 	icmp->icmp_connp = connp;
1005 }
1006 
1007 /*
1008  * This routine handles each T_DISCON_REQ message passed to icmp
1009  * as an indicating that ICMP is no longer connected. This results
1010  * in telling IP to restore the binding to just the local address.
1011  */
1012 static int
1013 icmp_do_disconnect(conn_t *connp)
1014 {
1015 	icmp_t	*icmp = connp->conn_icmp;
1016 	int	error;
1017 
1018 	mutex_enter(&connp->conn_lock);
1019 	if (icmp->icmp_state != TS_DATA_XFER) {
1020 		mutex_exit(&connp->conn_lock);
1021 		return (-TOUTSTATE);
1022 	}
1023 	if (connp->conn_mcbc_bind)
1024 		connp->conn_saddr_v6 = ipv6_all_zeros;
1025 	else
1026 		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
1027 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
1028 	connp->conn_faddr_v6 = ipv6_all_zeros;
1029 	icmp->icmp_state = TS_IDLE;
1030 
1031 	connp->conn_v6lastdst = ipv6_all_zeros;
1032 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
1033 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
1034 	mutex_exit(&connp->conn_lock);
1035 	if (error != 0)
1036 		return (error);
1037 
1038 	/*
1039 	 * Tell IP to remove the full binding and revert
1040 	 * to the local address binding.
1041 	 */
1042 	return (ip_laddr_fanout_insert(connp));
1043 }
1044 
1045 static void
1046 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1047 {
1048 	conn_t	*connp = Q_TO_CONN(q);
1049 	int	error;
1050 
1051 	/*
1052 	 * Allocate the largest primitive we need to send back
1053 	 * T_error_ack is > than T_ok_ack
1054 	 */
1055 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1056 	if (mp == NULL) {
1057 		/* Unable to reuse the T_DISCON_REQ for the ack. */
1058 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1059 		return;
1060 	}
1061 
1062 	error = icmp_do_disconnect(connp);
1063 
1064 	if (error != 0) {
1065 		if (error > 0) {
1066 			icmp_err_ack(q, mp, 0, error);
1067 		} else {
1068 			icmp_err_ack(q, mp, -error, 0);
1069 		}
1070 	} else {
1071 		mp = mi_tpi_ok_ack_alloc(mp);
1072 		ASSERT(mp != NULL);
1073 		qreply(q, mp);
1074 	}
1075 }
1076 
1077 static int
1078 icmp_disconnect(conn_t *connp)
1079 {
1080 	int	error;
1081 
1082 	connp->conn_dgram_errind = B_FALSE;
1083 
1084 	error = icmp_do_disconnect(connp);
1085 
1086 	if (error < 0)
1087 		error = proto_tlitosyserr(-error);
1088 	return (error);
1089 }
1090 
1091 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1092 static void
1093 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1094 {
1095 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1096 		qreply(q, mp);
1097 }
1098 
1099 /* Shorthand to generate and send TPI error acks to our client */
1100 static void
1101 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1102     t_scalar_t t_error, int sys_error)
1103 {
1104 	struct T_error_ack	*teackp;
1105 
1106 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1107 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1108 		teackp = (struct T_error_ack *)mp->b_rptr;
1109 		teackp->ERROR_prim = primitive;
1110 		teackp->TLI_error = t_error;
1111 		teackp->UNIX_error = sys_error;
1112 		qreply(q, mp);
1113 	}
1114 }
1115 
/*
 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
 * Assumes that IP has pulled up everything up to and including the ICMP header.
 *
 * arg1 is the conn_t the error is for; arg2 is unused.  IPv6 packets are
 * handed to icmp_icmp_error_ipv6().  mp is consumed on every path.
 *
 * The only errors treated as permanent here are port and protocol
 * unreachable, both reported as ECONNREFUSED.  A "fragmentation needed"
 * message is not reported; it only updates the DF bit in the cached
 * header template.
 */
/* ARGSUSED2 */
static void
icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t		*connp = (conn_t *)arg1;
	icmp_t		*icmp = connp->conn_icmp;
	icmph_t		*icmph;
	ipha_t		*ipha;
	int		iph_hdr_length;
	sin_t		sin;
	mblk_t		*mp1;
	int		error = 0;

	ipha = (ipha_t *)mp->b_rptr;

	ASSERT(OK_32PTR(mp->b_rptr));

	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
		icmp_icmp_error_ipv6(connp, mp, ira);
		return;
	}
	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);

	/* Skip past the outer IP and ICMP headers */
	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	/* From here on ipha refers to the header embedded in the ICMP payload */
	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */

	/* Note: this value is not used in the remainder of the function */
	iph_hdr_length = IPH_HDR_LENGTH(ipha);

	switch (icmph->icmph_type) {
	case ICMP_DEST_UNREACHABLE:
		switch (icmph->icmph_code) {
		case ICMP_FRAGMENTATION_NEEDED: {
			/*
			 * This ipha shadows the outer one and points at the
			 * conn's cached header template, not at the packet.
			 */
			ipha_t		*ipha;
			ip_xmit_attr_t	*ixa;
			/*
			 * IP has already adjusted the path MTU.
			 * But we need to adjust DF for IPv4.
			 */
			if (connp->conn_ipversion != IPV4_VERSION)
				break;

			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL || ixa->ixa_ire == NULL) {
				/*
				 * Some other thread holds conn_ixa. We will
				 * redo this on the next ICMP too big.
				 */
				if (ixa != NULL)
					ixa_refrele(ixa);
				break;
			}
			(void) ip_get_pmtu(ixa);

			/* Mirror IXAF_PMTU_IPV4_DF into the template's DF bit */
			mutex_enter(&connp->conn_lock);
			ipha = (ipha_t *)connp->conn_ht_iphc;
			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
				ipha->ipha_fragment_offset_and_flags |=
				    IPH_DF_HTONS;
			} else {
				ipha->ipha_fragment_offset_and_flags &=
				    ~IPH_DF_HTONS;
			}
			mutex_exit(&connp->conn_lock);
			ixa_refrele(ixa);
			break;
		}
		case ICMP_PORT_UNREACHABLE:
		case ICMP_PROTOCOL_UNREACHABLE:
			error = ECONNREFUSED;
			break;
		default:
			/* Transient errors */
			break;
		}
		break;
	default:
		/* Transient errors */
		break;
	}
	/* Nothing to report: the message has been fully handled */
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The error is attributed to the destination in the inner header */
	sin = sin_null;
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = ipha->ipha_dst;

	if (IPCL_IS_NONSTR(connp)) {
		/*
		 * Socket endpoint.  When connected, the error is passed up
		 * only if it is about the connected peer (the upcall is made
		 * after conn_lock is dropped); an error about any other
		 * address is ignored.  When not connected, the error and its
		 * address are stored in the icmp_t as a delayed error.
		 */
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			icmp->icmp_delayed_error = error;
			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		/* STREAMS endpoint: send a T_UDERROR_IND up the read side */
		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
		    error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}
1245 
/*
 * icmp_icmp_error_ipv6 is called by icmp_icmp_input to process ICMP for IPv6.
 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
 * Assumes that IP has pulled up all the extension headers as well as the
 * ICMPv6 header.
 *
 * mp is consumed on every path.  The errors treated as permanent are
 * "no port" destination unreachable and a parameter problem that points at
 * the next header field; both are reported as ECONNREFUSED.  A "packet too
 * big" message is never reported as an error; when the application asked
 * for path MTU information it is turned into an empty T_UNITDATA_IND that
 * carries an IPV6_PATHMTU option.
 */
static void
icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h, *outer_ip6h;
	uint16_t	iph_hdr_length;
	uint8_t		*nexthdrp;
	sin6_t		sin6;
	mblk_t		*mp1;
	int		error = 0;
	icmp_t		*icmp = connp->conn_icmp;

	outer_ip6h = (ip6_t *)mp->b_rptr;
#ifdef DEBUG
	/* Cross-check the header length that IP recorded in ira */
	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
	else
		iph_hdr_length = IPV6_HDR_LEN;
	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
#endif
	/* Skip past the outer IP and ICMP headers */
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];

	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
	/* Locate the inner packet's next header field; give up if that fails */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
		freemsg(mp);
		return;
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		switch (icmp6->icmp6_code) {
		case ICMP6_DST_UNREACH_NOPORT:
			error = ECONNREFUSED;
			break;
		case ICMP6_DST_UNREACH_ADMIN:
		case ICMP6_DST_UNREACH_NOROUTE:
		case ICMP6_DST_UNREACH_BEYONDSCOPE:
		case ICMP6_DST_UNREACH_ADDR:
			/* Transient errors */
			break;
		default:
			break;
		}
		break;
	case ICMP6_PACKET_TOO_BIG: {
		struct T_unitdata_ind	*tudi;
		struct T_opthdr		*toh;
		size_t			udi_size;
		mblk_t			*newmp;
		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
		    sizeof (struct ip6_mtuinfo);
		/* Shadows the function-level sin6; points into newmp */
		sin6_t			*sin6;
		struct ip6_mtuinfo	*mtuinfo;

		/*
		 * If the application has requested to receive path mtu
		 * information, send up an empty message containing an
		 * IPV6_PATHMTU ancillary data item.
		 */
		if (!connp->conn_ipv6_recvpathmtu)
			break;

		/* Layout: T_unitdata_ind, source sin6_t, T_opthdr, ip6_mtuinfo */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
		    opt_length;
		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
			break;
		}

		/*
		 * newmp->b_cont is left to NULL on purpose.  This is an
		 * empty message containing only ancillary data.
		 */
		newmp->b_datap->db_type = M_PROTO;
		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
		newmp->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin6_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
		tudi->OPT_length = opt_length;

		/* The source address reported is the conn's remote address */
		sin6 = (sin6_t *)&tudi[1];
		bzero(sin6, sizeof (sin6_t));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = connp->conn_faddr_v6;

		toh = (struct T_opthdr *)&sin6[1];
		toh->level = IPPROTO_IPV6;
		toh->name = IPV6_PATHMTU;
		toh->len = opt_length;
		toh->status = 0;

		/* The option names the inner packet's destination and the MTU */
		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
		/*
		 * NOTE(review): icmp6_mtu is copied as found in the packet,
		 * with no byte order conversion -- confirm this is intended.
		 */
		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
		/*
		 * We've consumed everything we need from the original
		 * message.  Free it, then send our empty message.
		 */
		freemsg(mp);
		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
		return;
	}
	case ICMP6_TIME_EXCEEDED:
		/* Transient errors */
		break;
	case ICMP6_PARAM_PROB:
		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
		    (uchar_t *)nexthdrp) {
			error = ECONNREFUSED;
			break;
		}
		break;
	}
	/* Nothing to report: the message has been fully handled */
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The error is attributed to the destination in the inner header */
	sin6 = sin6_null;
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = ip6h->ip6_dst;
	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
	if (IPCL_IS_NONSTR(connp)) {
		/*
		 * Socket endpoint.  When connected, the error is passed up
		 * only if it is about the connected peer (the upcall is made
		 * after conn_lock is dropped); an error about any other
		 * address is ignored.  When not connected, the error and its
		 * address are stored in the icmp_t as a delayed error.
		 */
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
			    &connp->conn_faddr_v6)) {
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			icmp->icmp_delayed_error = error;
			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		/* STREAMS endpoint: send a T_UDERROR_IND up the read side */
		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
		    NULL, 0, error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}
1415 
1416 /*
1417  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1418  * The local address is filled in if endpoint is bound. The remote address
1419  * is filled in if remote address has been precified ("connected endpoint")
1420  * (The concept of connected CLTS sockets is alien to published TPI
1421  *  but we support it anyway).
1422  */
1423 static void
1424 icmp_addr_req(queue_t *q, mblk_t *mp)
1425 {
1426 	struct sockaddr *sa;
1427 	mblk_t	*ackmp;
1428 	struct T_addr_ack *taa;
1429 	icmp_t	*icmp = Q_TO_ICMP(q);
1430 	conn_t	*connp = icmp->icmp_connp;
1431 	uint_t	addrlen;
1432 
1433 	/* Make it large enough for worst case */
1434 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1435 	    2 * sizeof (sin6_t), 1);
1436 	if (ackmp == NULL) {
1437 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1438 		return;
1439 	}
1440 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1441 
1442 	bzero(taa, sizeof (struct T_addr_ack));
1443 	ackmp->b_wptr = (uchar_t *)&taa[1];
1444 
1445 	taa->PRIM_type = T_ADDR_ACK;
1446 	ackmp->b_datap->db_type = M_PCPROTO;
1447 
1448 	if (connp->conn_family == AF_INET)
1449 		addrlen = sizeof (sin_t);
1450 	else
1451 		addrlen = sizeof (sin6_t);
1452 
1453 	mutex_enter(&connp->conn_lock);
1454 	/*
1455 	 * Note: Following code assumes 32 bit alignment of basic
1456 	 * data structures like sin_t and struct T_addr_ack.
1457 	 */
1458 	if (icmp->icmp_state != TS_UNBND) {
1459 		/*
1460 		 * Fill in local address first
1461 		 */
1462 		taa->LOCADDR_offset = sizeof (*taa);
1463 		taa->LOCADDR_length = addrlen;
1464 		sa = (struct sockaddr *)&taa[1];
1465 		(void) conn_getsockname(connp, sa, &addrlen);
1466 		ackmp->b_wptr += addrlen;
1467 	}
1468 	if (icmp->icmp_state == TS_DATA_XFER) {
1469 		/*
1470 		 * connected, fill remote address too
1471 		 */
1472 		taa->REMADDR_length = addrlen;
1473 		/* assumed 32-bit alignment */
1474 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1475 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1476 		(void) conn_getpeername(connp, sa, &addrlen);
1477 		ackmp->b_wptr += addrlen;
1478 	}
1479 	mutex_exit(&connp->conn_lock);
1480 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1481 	qreply(q, ackmp);
1482 }
1483 
1484 static void
1485 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1486 {
1487 	conn_t		*connp = icmp->icmp_connp;
1488 
1489 	*tap = icmp_g_t_info_ack;
1490 
1491 	if (connp->conn_family == AF_INET6)
1492 		tap->ADDR_size = sizeof (sin6_t);
1493 	else
1494 		tap->ADDR_size = sizeof (sin_t);
1495 	tap->CURRENT_state = icmp->icmp_state;
1496 	tap->OPT_size = icmp_max_optsize;
1497 }
1498 
1499 static void
1500 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1501     t_uscalar_t cap_bits1)
1502 {
1503 	tcap->CAP_bits1 = 0;
1504 
1505 	if (cap_bits1 & TC1_INFO) {
1506 		icmp_copy_info(&tcap->INFO_ack, icmp);
1507 		tcap->CAP_bits1 |= TC1_INFO;
1508 	}
1509 }
1510 
1511 /*
1512  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1513  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1514  * icmp_g_t_info_ack.  The current state of the stream is copied from
1515  * icmp_state.
1516  */
1517 static void
1518 icmp_capability_req(queue_t *q, mblk_t *mp)
1519 {
1520 	icmp_t			*icmp = Q_TO_ICMP(q);
1521 	t_uscalar_t		cap_bits1;
1522 	struct T_capability_ack	*tcap;
1523 
1524 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1525 
1526 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1527 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1528 	if (!mp)
1529 		return;
1530 
1531 	tcap = (struct T_capability_ack *)mp->b_rptr;
1532 
1533 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1534 
1535 	qreply(q, mp);
1536 }
1537 
1538 /*
1539  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1540  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1541  * The current state of the stream is copied from icmp_state.
1542  */
1543 static void
1544 icmp_info_req(queue_t *q, mblk_t *mp)
1545 {
1546 	icmp_t	*icmp = Q_TO_ICMP(q);
1547 
1548 	/* Create a T_INFO_ACK message. */
1549 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1550 	    T_INFO_ACK);
1551 	if (!mp)
1552 		return;
1553 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1554 	qreply(q, mp);
1555 }
1556 
1557 static int
1558 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1559     int family)
1560 {
1561 	conn_t *connp;
1562 	dev_t	conn_dev;
1563 	int	error;
1564 
1565 	/* If the stream is already open, return immediately. */
1566 	if (q->q_ptr != NULL)
1567 		return (0);
1568 
1569 	if (sflag == MODOPEN)
1570 		return (EINVAL);
1571 
1572 	/*
1573 	 * Since ICMP is not used so heavily, allocating from the small
1574 	 * arena should be sufficient.
1575 	 */
1576 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1577 		return (EBUSY);
1578 	}
1579 
1580 	if (flag & SO_FALLBACK) {
1581 		/*
1582 		 * Non streams socket needs a stream to fallback to
1583 		 */
1584 		RD(q)->q_ptr = (void *)conn_dev;
1585 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1586 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1587 		qprocson(q);
1588 		return (0);
1589 	}
1590 
1591 	connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1592 	if (connp == NULL) {
1593 		ASSERT(error != 0);
1594 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1595 		return (error);
1596 	}
1597 
1598 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1599 	connp->conn_dev = conn_dev;
1600 	connp->conn_minor_arena = ip_minor_arena_sa;
1601 
1602 	/*
1603 	 * Initialize the icmp_t structure for this stream.
1604 	 */
1605 	q->q_ptr = connp;
1606 	WR(q)->q_ptr = connp;
1607 	connp->conn_rq = q;
1608 	connp->conn_wq = WR(q);
1609 
1610 	WR(q)->q_hiwat = connp->conn_sndbuf;
1611 	WR(q)->q_lowat = connp->conn_sndlowat;
1612 
1613 	qprocson(q);
1614 
1615 	/* Set the Stream head write offset. */
1616 	(void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1617 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1618 
1619 	mutex_enter(&connp->conn_lock);
1620 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1621 	mutex_exit(&connp->conn_lock);
1622 
1623 	icmp_bind_proto(connp->conn_icmp);
1624 
1625 	return (0);
1626 }
1627 
1628 /* For /dev/icmp aka AF_INET open */
1629 static int
1630 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1631 {
1632 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1633 }
1634 
1635 /* For /dev/icmp6 aka AF_INET6 open */
1636 static int
1637 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1638 {
1639 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1640 }
1641 
1642 /*
1643  * This is the open routine for icmp.  It allocates a icmp_t structure for
1644  * the stream and, on the first open of the module, creates an ND table.
1645  */
1646 static conn_t *
1647 rawip_do_open(int family, cred_t *credp, int *err, int flags)
1648 {
1649 	icmp_t	*icmp;
1650 	conn_t *connp;
1651 	zoneid_t zoneid;
1652 	netstack_t *ns;
1653 	icmp_stack_t *is;
1654 	int len;
1655 	boolean_t isv6 = B_FALSE;
1656 
1657 	*err = secpolicy_net_icmpaccess(credp);
1658 	if (*err != 0)
1659 		return (NULL);
1660 
1661 	if (family == AF_INET6)
1662 		isv6 = B_TRUE;
1663 
1664 	ns = netstack_find_by_cred(credp);
1665 	ASSERT(ns != NULL);
1666 	is = ns->netstack_icmp;
1667 	ASSERT(is != NULL);
1668 
1669 	/*
1670 	 * For exclusive stacks we set the zoneid to zero
1671 	 * to make ICMP operate as if in the global zone.
1672 	 */
1673 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1674 		zoneid = GLOBAL_ZONEID;
1675 	else
1676 		zoneid = crgetzoneid(credp);
1677 
1678 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1679 
1680 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1681 	icmp = connp->conn_icmp;
1682 
1683 	/*
1684 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1685 	 * done by netstack_find_by_cred()
1686 	 */
1687 	netstack_rele(ns);
1688 
1689 	/*
1690 	 * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1691 	 * need to lock anything.
1692 	 */
1693 	ASSERT(connp->conn_proto == IPPROTO_ICMP);
1694 	ASSERT(connp->conn_icmp == icmp);
1695 	ASSERT(icmp->icmp_connp == connp);
1696 
1697 	/* Set the initial state of the stream and the privilege status. */
1698 	icmp->icmp_state = TS_UNBND;
1699 	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1700 	if (isv6) {
1701 		connp->conn_family = AF_INET6;
1702 		connp->conn_ipversion = IPV6_VERSION;
1703 		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1704 		connp->conn_proto = IPPROTO_ICMPV6;
1705 		/* May be changed by a SO_PROTOTYPE socket option. */
1706 		connp->conn_proto = IPPROTO_ICMPV6;
1707 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1708 		connp->conn_ixa->ixa_raw_cksum_offset = 2;
1709 		connp->conn_default_ttl = is->is_ipv6_hoplimit;
1710 		len = sizeof (ip6_t);
1711 	} else {
1712 		connp->conn_family = AF_INET;
1713 		connp->conn_ipversion = IPV4_VERSION;
1714 		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1715 		/* May be changed by a SO_PROTOTYPE socket option. */
1716 		connp->conn_proto = IPPROTO_ICMP;
1717 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1718 		connp->conn_default_ttl = is->is_ipv4_ttl;
1719 		len = sizeof (ipha_t);
1720 	}
1721 	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1722 
1723 	connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1724 
1725 	/*
1726 	 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1727 	 * the checksum is provided in the pre-built packet. We clear
1728 	 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1729 	 * complete IP header and not to compute the transport checksum.
1730 	 */
1731 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1732 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1733 	connp->conn_ixa->ixa_zoneid = zoneid;
1734 
1735 	connp->conn_zoneid = zoneid;
1736 
1737 	/*
1738 	 * If the caller has the process-wide flag set, then default to MAC
1739 	 * exempt mode.  This allows read-down to unlabeled hosts.
1740 	 */
1741 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1742 		connp->conn_mac_mode = CONN_MAC_AWARE;
1743 
1744 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1745 
1746 	icmp->icmp_is = is;
1747 
1748 	connp->conn_rcvbuf = is->is_recv_hiwat;
1749 	connp->conn_sndbuf = is->is_xmit_hiwat;
1750 	connp->conn_sndlowat = is->is_xmit_lowat;
1751 	connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1752 
1753 	connp->conn_wroff = len + is->is_wroff_extra;
1754 	connp->conn_so_type = SOCK_RAW;
1755 
1756 	connp->conn_recv = icmp_input;
1757 	connp->conn_recvicmp = icmp_icmp_input;
1758 	crhold(credp);
1759 	connp->conn_cred = credp;
1760 	connp->conn_cpid = curproc->p_pid;
1761 	connp->conn_open_time = ddi_get_lbolt64();
1762 	/* Cache things in ixa without an extra refhold */
1763 	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1764 	connp->conn_ixa->ixa_cred = connp->conn_cred;
1765 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1766 	if (is_system_labeled())
1767 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1768 
1769 	connp->conn_flow_cntrld = B_FALSE;
1770 
1771 	if (is->is_pmtu_discovery)
1772 		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1773 
1774 	return (connp);
1775 }
1776 
1777 /*
1778  * Which ICMP options OK to set through T_UNITDATA_REQ...
1779  */
1780 /* ARGSUSED */
1781 static boolean_t
1782 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1783 {
1784 	return (B_TRUE);
1785 }
1786 
1787 /*
1788  * This routine gets default values of certain options whose default
1789  * values are maintained by protcol specific code
1790  */
1791 int
1792 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1793 {
1794 	icmp_t *icmp = Q_TO_ICMP(q);
1795 	icmp_stack_t *is = icmp->icmp_is;
1796 	int *i1 = (int *)ptr;
1797 
1798 	switch (level) {
1799 	case IPPROTO_IP:
1800 		switch (name) {
1801 		case IP_MULTICAST_TTL:
1802 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1803 			return (sizeof (uchar_t));
1804 		case IP_MULTICAST_LOOP:
1805 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1806 			return (sizeof (uchar_t));
1807 		}
1808 		break;
1809 	case IPPROTO_IPV6:
1810 		switch (name) {
1811 		case IPV6_MULTICAST_HOPS:
1812 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1813 			return (sizeof (int));
1814 		case IPV6_MULTICAST_LOOP:
1815 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1816 			return (sizeof (int));
1817 		case IPV6_UNICAST_HOPS:
1818 			*i1 = is->is_ipv6_hoplimit;
1819 			return (sizeof (int));
1820 		}
1821 		break;
1822 	case IPPROTO_ICMPV6:
1823 		switch (name) {
1824 		case ICMP6_FILTER:
1825 			/* Make it look like "pass all" */
1826 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1827 			return (sizeof (icmp6_filter_t));
1828 		}
1829 		break;
1830 	}
1831 	return (-1);
1832 }
1833 
1834 /*
1835  * This routine retrieves the current status of socket options.
1836  * It returns the size of the option retrieved, or -1.
1837  */
1838 int
1839 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1840 {
1841 	icmp_t		*icmp = connp->conn_icmp;
1842 	int		*i1 = (int *)ptr;
1843 	conn_opt_arg_t	coas;
1844 	int		retval;
1845 
1846 	coas.coa_connp = connp;
1847 	coas.coa_ixa = connp->conn_ixa;
1848 	coas.coa_ipp = &connp->conn_xmit_ipp;
1849 	coas.coa_ancillary = B_FALSE;
1850 	coas.coa_changed = 0;
1851 
1852 	/*
1853 	 * We assume that the optcom framework has checked for the set
1854 	 * of levels and names that are supported, hence we don't worry
1855 	 * about rejecting based on that.
1856 	 * First check for ICMP specific handling, then pass to common routine.
1857 	 */
1858 	switch (level) {
1859 	case IPPROTO_IP:
1860 		/*
1861 		 * Only allow IPv4 option processing on IPv4 sockets.
1862 		 */
1863 		if (connp->conn_family != AF_INET)
1864 			return (-1);
1865 
1866 		switch (name) {
1867 		case IP_OPTIONS:
1868 		case T_IP_OPTIONS:
1869 			/* Options are passed up with each packet */
1870 			return (0);
1871 		case IP_HDRINCL:
1872 			mutex_enter(&connp->conn_lock);
1873 			*i1 = (int)icmp->icmp_hdrincl;
1874 			mutex_exit(&connp->conn_lock);
1875 			return (sizeof (int));
1876 		}
1877 		break;
1878 
1879 	case IPPROTO_IPV6:
1880 		/*
1881 		 * Only allow IPv6 option processing on native IPv6 sockets.
1882 		 */
1883 		if (connp->conn_family != AF_INET6)
1884 			return (-1);
1885 
1886 		switch (name) {
1887 		case IPV6_CHECKSUM:
1888 			/*
1889 			 * Return offset or -1 if no checksum offset.
1890 			 * Does not apply to IPPROTO_ICMPV6
1891 			 */
1892 			if (connp->conn_proto == IPPROTO_ICMPV6)
1893 				return (-1);
1894 
1895 			mutex_enter(&connp->conn_lock);
1896 			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1897 				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1898 			else
1899 				*i1 = -1;
1900 			mutex_exit(&connp->conn_lock);
1901 			return (sizeof (int));
1902 		}
1903 		break;
1904 
1905 	case IPPROTO_ICMPV6:
1906 		/*
1907 		 * Only allow IPv6 option processing on native IPv6 sockets.
1908 		 */
1909 		if (connp->conn_family != AF_INET6)
1910 			return (-1);
1911 
1912 		if (connp->conn_proto != IPPROTO_ICMPV6)
1913 			return (-1);
1914 
1915 		switch (name) {
1916 		case ICMP6_FILTER:
1917 			mutex_enter(&connp->conn_lock);
1918 			if (icmp->icmp_filter == NULL) {
1919 				/* Make it look like "pass all" */
1920 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1921 			} else {
1922 				(void) bcopy(icmp->icmp_filter, ptr,
1923 				    sizeof (icmp6_filter_t));
1924 			}
1925 			mutex_exit(&connp->conn_lock);
1926 			return (sizeof (icmp6_filter_t));
1927 		}
1928 	}
1929 	mutex_enter(&connp->conn_lock);
1930 	retval = conn_opt_get(&coas, level, name, ptr);
1931 	mutex_exit(&connp->conn_lock);
1932 	return (retval);
1933 }
1934 
1935 /*
1936  * This routine retrieves the current status of socket options.
1937  * It returns the size of the option retrieved, or -1.
1938  */
1939 int
1940 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1941 {
1942 	conn_t		*connp = Q_TO_CONN(q);
1943 	int 		err;
1944 
1945 	err = icmp_opt_get(connp, level, name, ptr);
1946 	return (err);
1947 }
1948 
1949 /*
1950  * This routine sets socket options.
1951  */
1952 int
1953 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1954     uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1955 {
1956 	conn_t		*connp = coa->coa_connp;
1957 	ip_xmit_attr_t	*ixa = coa->coa_ixa;
1958 	icmp_t		*icmp = connp->conn_icmp;
1959 	icmp_stack_t	*is = icmp->icmp_is;
1960 	int		*i1 = (int *)invalp;
1961 	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
1962 	int		error;
1963 
1964 	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1965 
1966 	/*
1967 	 * For fixed length options, no sanity check
1968 	 * of passed in length is done. It is assumed *_optcom_req()
1969 	 * routines do the right thing.
1970 	 */
1971 
1972 	switch (level) {
1973 	case SOL_SOCKET:
1974 		switch (name) {
1975 		case SO_PROTOTYPE:
1976 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
1977 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
1978 			    secpolicy_net_rawaccess(cr) != 0) {
1979 				return (EACCES);
1980 			}
1981 			if (checkonly)
1982 				break;
1983 
1984 			mutex_enter(&connp->conn_lock);
1985 			connp->conn_proto = *i1 & 0xFF;
1986 			ixa->ixa_protocol = connp->conn_proto;
1987 			if ((connp->conn_proto == IPPROTO_RAW ||
1988 			    connp->conn_proto == IPPROTO_IGMP) &&
1989 			    connp->conn_family == AF_INET) {
1990 				icmp->icmp_hdrincl = 1;
1991 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1992 			} else if (connp->conn_proto == IPPROTO_UDP ||
1993 			    connp->conn_proto == IPPROTO_TCP ||
1994 			    connp->conn_proto == IPPROTO_SCTP) {
1995 				/* Used by test applications like psh */
1996 				icmp->icmp_hdrincl = 0;
1997 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1998 			} else {
1999 				icmp->icmp_hdrincl = 0;
2000 				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2001 			}
2002 
2003 			if (connp->conn_family == AF_INET6 &&
2004 			    connp->conn_proto == IPPROTO_ICMPV6) {
2005 				/* Set offset for icmp6_cksum */
2006 				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2007 				ixa->ixa_raw_cksum_offset = 2;
2008 			}
2009 			if (icmp->icmp_filter != NULL &&
2010 			    connp->conn_proto != IPPROTO_ICMPV6) {
2011 				kmem_free(icmp->icmp_filter,
2012 				    sizeof (icmp6_filter_t));
2013 				icmp->icmp_filter = NULL;
2014 			}
2015 			mutex_exit(&connp->conn_lock);
2016 
2017 			coa->coa_changed |= COA_HEADER_CHANGED;
2018 			/*
2019 			 * For SCTP, we don't use icmp_bind_proto() for
2020 			 * raw socket binding.
2021 			 */
2022 			if (connp->conn_proto == IPPROTO_SCTP)
2023 				return (0);
2024 
2025 			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
2026 			return (0);
2027 
2028 		case SO_SNDBUF:
2029 			if (*i1 > is->is_max_buf) {
2030 				return (ENOBUFS);
2031 			}
2032 			break;
2033 		case SO_RCVBUF:
2034 			if (*i1 > is->is_max_buf) {
2035 				return (ENOBUFS);
2036 			}
2037 			break;
2038 		}
2039 		break;
2040 
2041 	case IPPROTO_IP:
2042 		/*
2043 		 * Only allow IPv4 option processing on IPv4 sockets.
2044 		 */
2045 		if (connp->conn_family != AF_INET)
2046 			return (EINVAL);
2047 
2048 		switch (name) {
2049 		case IP_HDRINCL:
2050 			if (!checkonly) {
2051 				mutex_enter(&connp->conn_lock);
2052 				icmp->icmp_hdrincl = onoff;
2053 				if (onoff)
2054 					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2055 				else
2056 					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2057 				mutex_exit(&connp->conn_lock);
2058 			}
2059 			break;
2060 		}
2061 		break;
2062 
2063 	case IPPROTO_IPV6:
2064 		if (connp->conn_family != AF_INET6)
2065 			return (EINVAL);
2066 
2067 		switch (name) {
2068 		case IPV6_CHECKSUM:
2069 			/*
2070 			 * Integer offset into the user data of where the
2071 			 * checksum is located.
2072 			 * Offset of -1 disables option.
2073 			 * Does not apply to IPPROTO_ICMPV6.
2074 			 */
2075 			if (connp->conn_proto == IPPROTO_ICMPV6 ||
2076 			    coa->coa_ancillary) {
2077 				return (EINVAL);
2078 			}
2079 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2080 				/* Negative or not 16 bit aligned offset */
2081 				return (EINVAL);
2082 			}
2083 			if (checkonly)
2084 				break;
2085 
2086 			mutex_enter(&connp->conn_lock);
2087 			if (*i1 == -1) {
2088 				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2089 				ixa->ixa_raw_cksum_offset = 0;
2090 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2091 			} else {
2092 				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2093 				ixa->ixa_raw_cksum_offset = *i1;
2094 				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2095 			}
2096 			mutex_exit(&connp->conn_lock);
2097 			break;
2098 		}
2099 		break;
2100 
2101 	case IPPROTO_ICMPV6:
2102 		/*
2103 		 * Only allow IPv6 option processing on IPv6 sockets.
2104 		 */
2105 		if (connp->conn_family != AF_INET6)
2106 			return (EINVAL);
2107 		if (connp->conn_proto != IPPROTO_ICMPV6)
2108 			return (EINVAL);
2109 
2110 		switch (name) {
2111 		case ICMP6_FILTER:
2112 			if (checkonly)
2113 				break;
2114 
2115 			if ((inlen != 0) &&
2116 			    (inlen != sizeof (icmp6_filter_t)))
2117 				return (EINVAL);
2118 
2119 			mutex_enter(&connp->conn_lock);
2120 			if (inlen == 0) {
2121 				if (icmp->icmp_filter != NULL) {
2122 					kmem_free(icmp->icmp_filter,
2123 					    sizeof (icmp6_filter_t));
2124 					icmp->icmp_filter = NULL;
2125 				}
2126 			} else {
2127 				if (icmp->icmp_filter == NULL) {
2128 					icmp->icmp_filter = kmem_alloc(
2129 					    sizeof (icmp6_filter_t),
2130 					    KM_NOSLEEP);
2131 					if (icmp->icmp_filter == NULL) {
2132 						mutex_exit(&connp->conn_lock);
2133 						return (ENOBUFS);
2134 					}
2135 				}
2136 				(void) bcopy(invalp, icmp->icmp_filter, inlen);
2137 			}
2138 			mutex_exit(&connp->conn_lock);
2139 			break;
2140 		}
2141 		break;
2142 	}
2143 	error = conn_opt_set(coa, level, name, inlen, invalp,
2144 	    checkonly, cr);
2145 	return (error);
2146 }
2147 
2148 /*
2149  * This routine sets socket options.
2150  */
2151 int
2152 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2153     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2154     void *thisdg_attrs, cred_t *cr)
2155 {
2156 	icmp_t		*icmp = connp->conn_icmp;
2157 	int		err;
2158 	conn_opt_arg_t	coas, *coa;
2159 	boolean_t	checkonly;
2160 	icmp_stack_t	*is = icmp->icmp_is;
2161 
2162 	switch (optset_context) {
2163 	case SETFN_OPTCOM_CHECKONLY:
2164 		checkonly = B_TRUE;
2165 		/*
2166 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2167 		 * inlen != 0 implies value supplied and
2168 		 * 	we have to "pretend" to set it.
2169 		 * inlen == 0 implies that there is no
2170 		 * 	value part in T_CHECK request and just validation
2171 		 * done elsewhere should be enough, we just return here.
2172 		 */
2173 		if (inlen == 0) {
2174 			*outlenp = 0;
2175 			return (0);
2176 		}
2177 		break;
2178 	case SETFN_OPTCOM_NEGOTIATE:
2179 		checkonly = B_FALSE;
2180 		break;
2181 	case SETFN_UD_NEGOTIATE:
2182 	case SETFN_CONN_NEGOTIATE:
2183 		checkonly = B_FALSE;
2184 		/*
2185 		 * Negotiating local and "association-related" options
2186 		 * through T_UNITDATA_REQ.
2187 		 *
2188 		 * Following routine can filter out ones we do not
2189 		 * want to be "set" this way.
2190 		 */
2191 		if (!icmp_opt_allow_udr_set(level, name)) {
2192 			*outlenp = 0;
2193 			return (EINVAL);
2194 		}
2195 		break;
2196 	default:
2197 		/*
2198 		 * We should never get here
2199 		 */
2200 		*outlenp = 0;
2201 		return (EINVAL);
2202 	}
2203 
2204 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2205 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2206 
2207 	if (thisdg_attrs != NULL) {
2208 		/* Options from T_UNITDATA_REQ */
2209 		coa = (conn_opt_arg_t *)thisdg_attrs;
2210 		ASSERT(coa->coa_connp == connp);
2211 		ASSERT(coa->coa_ixa != NULL);
2212 		ASSERT(coa->coa_ipp != NULL);
2213 		ASSERT(coa->coa_ancillary);
2214 	} else {
2215 		coa = &coas;
2216 		coas.coa_connp = connp;
2217 		/* Get a reference on conn_ixa to prevent concurrent mods */
2218 		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2219 		if (coas.coa_ixa == NULL) {
2220 			*outlenp = 0;
2221 			return (ENOMEM);
2222 		}
2223 		coas.coa_ipp = &connp->conn_xmit_ipp;
2224 		coas.coa_ancillary = B_FALSE;
2225 		coas.coa_changed = 0;
2226 	}
2227 
2228 	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2229 	    cr, checkonly);
2230 	if (err != 0) {
2231 errout:
2232 		if (!coa->coa_ancillary)
2233 			ixa_refrele(coa->coa_ixa);
2234 		*outlenp = 0;
2235 		return (err);
2236 	}
2237 
2238 	/*
2239 	 * Common case of OK return with outval same as inval.
2240 	 */
2241 	if (invalp != outvalp) {
2242 		/* don't trust bcopy for identical src/dst */
2243 		(void) bcopy(invalp, outvalp, inlen);
2244 	}
2245 	*outlenp = inlen;
2246 
2247 	/*
2248 	 * If this was not ancillary data, then we rebuild the headers,
2249 	 * update the IRE/NCE, and IPsec as needed.
2250 	 * Since the label depends on the destination we go through
2251 	 * ip_set_destination first.
2252 	 */
2253 	if (coa->coa_ancillary) {
2254 		return (0);
2255 	}
2256 
2257 	if (coa->coa_changed & COA_ROUTE_CHANGED) {
2258 		in6_addr_t saddr, faddr, nexthop;
2259 		in_port_t fport;
2260 
2261 		/*
2262 		 * We clear lastdst to make sure we pick up the change
2263 		 * next time sending.
2264 		 * If we are connected we re-cache the information.
2265 		 * We ignore errors to preserve BSD behavior.
2266 		 * Note that we don't redo IPsec policy lookup here
2267 		 * since the final destination (or source) didn't change.
2268 		 */
2269 		mutex_enter(&connp->conn_lock);
2270 		connp->conn_v6lastdst = ipv6_all_zeros;
2271 
2272 		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2273 		    &connp->conn_faddr_v6, &nexthop);
2274 		saddr = connp->conn_saddr_v6;
2275 		faddr = connp->conn_faddr_v6;
2276 		fport = connp->conn_fport;
2277 		mutex_exit(&connp->conn_lock);
2278 
2279 		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2280 		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2281 			(void) ip_attr_connect(connp, coa->coa_ixa,
2282 			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
2283 			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2284 		}
2285 	}
2286 
2287 	ixa_refrele(coa->coa_ixa);
2288 
2289 	if (coa->coa_changed & COA_HEADER_CHANGED) {
2290 		/*
2291 		 * Rebuild the header template if we are connected.
2292 		 * Otherwise clear conn_v6lastdst so we rebuild the header
2293 		 * in the data path.
2294 		 */
2295 		mutex_enter(&connp->conn_lock);
2296 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2297 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2298 			err = icmp_build_hdr_template(connp,
2299 			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2300 			    connp->conn_flowinfo);
2301 			if (err != 0) {
2302 				mutex_exit(&connp->conn_lock);
2303 				return (err);
2304 			}
2305 		} else {
2306 			connp->conn_v6lastdst = ipv6_all_zeros;
2307 		}
2308 		mutex_exit(&connp->conn_lock);
2309 	}
2310 	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2311 		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2312 		    connp->conn_rcvbuf);
2313 	}
2314 	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2315 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2316 	}
2317 	if (coa->coa_changed & COA_WROFF_CHANGED) {
2318 		/* Increase wroff if needed */
2319 		uint_t wroff;
2320 
2321 		mutex_enter(&connp->conn_lock);
2322 		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2323 		if (wroff > connp->conn_wroff) {
2324 			connp->conn_wroff = wroff;
2325 			mutex_exit(&connp->conn_lock);
2326 			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2327 		} else {
2328 			mutex_exit(&connp->conn_lock);
2329 		}
2330 	}
2331 	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2332 		icmp_bind_proto(icmp);
2333 	}
2334 	return (err);
2335 }
2336 
2337 /* This routine sets socket options. */
2338 int
2339 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2340     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2341     void *thisdg_attrs, cred_t *cr)
2342 {
2343 	conn_t	*connp = Q_TO_CONN(q);
2344 	int error;
2345 
2346 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2347 	    outlenp, outvalp, thisdg_attrs, cr);
2348 	return (error);
2349 }
2350 
2351 /*
2352  * Setup IP headers.
2353  *
2354  * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2355  * but icmp_output_hdrincl restores ipha_protocol once we return.
2356  */
2357 mblk_t *
2358 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2359     const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2360     mblk_t *data_mp, int *errorp)
2361 {
2362 	mblk_t		*mp;
2363 	icmp_stack_t	*is = connp->conn_netstack->netstack_icmp;
2364 	uint_t		data_len;
2365 	uint32_t	cksum;
2366 
2367 	data_len = msgdsize(data_mp);
2368 	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2369 	    flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2370 	if (mp == NULL) {
2371 		ASSERT(*errorp != 0);
2372 		return (NULL);
2373 	}
2374 
2375 	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2376 
2377 	/*
2378 	 * If there was a routing option/header then conn_prepend_hdr
2379 	 * has massaged it and placed the pseudo-header checksum difference
2380 	 * in the cksum argument.
2381 	 *
2382 	 * Prepare for ICMPv6 checksum done in IP.
2383 	 *
2384 	 * We make it easy for IP to include our pseudo header
2385 	 * by putting our length (and any routing header adjustment)
2386 	 * in the ICMPv6 checksum field.
2387 	 * The IP source, destination, and length have already been set by
2388 	 * conn_prepend_hdr.
2389 	 */
2390 	cksum += data_len;
2391 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
2392 	ASSERT(cksum < 0x10000);
2393 
2394 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2395 		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
2396 
2397 		ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2398 	} else {
2399 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2400 		uint_t	cksum_offset = 0;
2401 
2402 		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2403 
2404 		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2405 			if (connp->conn_proto == IPPROTO_ICMPV6) {
2406 				cksum_offset = ixa->ixa_ip_hdr_length +
2407 				    offsetof(icmp6_t, icmp6_cksum);
2408 			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2409 				cksum_offset = ixa->ixa_ip_hdr_length +
2410 				    ixa->ixa_raw_cksum_offset;
2411 			}
2412 		}
2413 		if (cksum_offset != 0) {
2414 			uint16_t *ptr;
2415 
2416 			/* Make sure the checksum fits in the first mblk */
2417 			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2418 				mblk_t *mp1;
2419 
2420 				mp1 = msgpullup(mp,
2421 				    cksum_offset + sizeof (short));
2422 				freemsg(mp);
2423 				if (mp1 == NULL) {
2424 					*errorp = ENOMEM;
2425 					return (NULL);
2426 				}
2427 				mp = mp1;
2428 				ip6h = (ip6_t *)mp->b_rptr;
2429 			}
2430 			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2431 			*ptr = htons(cksum);
2432 		}
2433 	}
2434 
2435 	/* Note that we don't try to update wroff due to ancillary data */
2436 	return (mp);
2437 }
2438 
2439 static int
2440 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2441     const in6_addr_t *v6dst, uint32_t flowinfo)
2442 {
2443 	int		error;
2444 
2445 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2446 	/*
2447 	 * We clear lastdst to make sure we don't use the lastdst path
2448 	 * next time sending since we might not have set v6dst yet.
2449 	 */
2450 	connp->conn_v6lastdst = ipv6_all_zeros;
2451 
2452 	error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2453 	if (error != 0)
2454 		return (error);
2455 
2456 	/*
2457 	 * Any routing header/option has been massaged. The checksum difference
2458 	 * is stored in conn_sum.
2459 	 */
2460 	return (0);
2461 }
2462 
2463 static mblk_t *
2464 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2465 {
2466 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2467 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2468 		/*
2469 		 * fallback has started but messages have not been moved yet
2470 		 */
2471 		if (icmp->icmp_fallback_queue_head == NULL) {
2472 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2473 			icmp->icmp_fallback_queue_head = mp;
2474 			icmp->icmp_fallback_queue_tail = mp;
2475 		} else {
2476 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2477 			icmp->icmp_fallback_queue_tail->b_next = mp;
2478 			icmp->icmp_fallback_queue_tail = mp;
2479 		}
2480 		return (NULL);
2481 	} else {
2482 		/*
2483 		 * Fallback completed, let the caller putnext() the mblk.
2484 		 */
2485 		return (mp);
2486 	}
2487 }
2488 
2489 /*
2490  * Deliver data to ULP. In case we have a socket, and it's falling back to
2491  * TPI, then we'll queue the mp for later processing.
2492  */
2493 static void
2494 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2495 {
2496 	if (IPCL_IS_NONSTR(connp)) {
2497 		icmp_t *icmp = connp->conn_icmp;
2498 		int error;
2499 
2500 		ASSERT(len == msgdsize(mp));
2501 		if ((*connp->conn_upcalls->su_recv)
2502 		    (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2503 			mutex_enter(&icmp->icmp_recv_lock);
2504 			if (error == ENOSPC) {
2505 				/*
2506 				 * let's confirm while holding the lock
2507 				 */
2508 				if ((*connp->conn_upcalls->su_recv)
2509 				    (connp->conn_upper_handle, NULL, 0, 0,
2510 				    &error, NULL) < 0) {
2511 					ASSERT(error == ENOSPC);
2512 					if (error == ENOSPC) {
2513 						connp->conn_flow_cntrld =
2514 						    B_TRUE;
2515 					}
2516 				}
2517 				mutex_exit(&icmp->icmp_recv_lock);
2518 			} else {
2519 				ASSERT(error == EOPNOTSUPP);
2520 				mp = icmp_queue_fallback(icmp, mp);
2521 				mutex_exit(&icmp->icmp_recv_lock);
2522 				if (mp != NULL)
2523 					putnext(connp->conn_rq, mp);
2524 			}
2525 		}
2526 		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2527 	} else {
2528 		putnext(connp->conn_rq, mp);
2529 	}
2530 }
2531 
/*
 * This is the inbound data path.
 * IP has already pulled up the IP headers and verified alignment
 * etc.
 *
 * arg1 is the conn_t the packet was fanned out to, mp is the M_DATA packet
 * starting at the IP header and ira describes it (length, header length,
 * IPv4/IPv6).  arg2 is unused.
 *
 * The packet is wrapped in a T_UNITDATA_IND carrying the source address and
 * any requested ancillary data and handed to icmp_ulp_recv().  IPv4 packets
 * are delivered with their IP header still attached; for IPv6 the headers
 * are skipped.  The message is consumed in all cases: on allocation
 * failure, ICMPv6 filter match or raw checksum failure it is freed here.
 */
/* ARGSUSED2 */
static void
icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t			*connp = (conn_t *)arg1;
	struct T_unitdata_ind	*tudi;
	uchar_t			*rptr;		/* Pointer to IP header */
	int			ip_hdr_length;
	int			udi_size;	/* Size of T_unitdata_ind */
	int			pkt_len;
	icmp_t			*icmp;
	ip_pkt_t		ipps;
	ip6_t			*ip6h;
	mblk_t			*mp1;
	crb_t			recv_ancillary;
	icmp_stack_t		*is;
	sin_t			*sin;
	sin6_t			*sin6;
	ipha_t			*ipha;

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);

	icmp = connp->conn_icmp;
	is = icmp->icmp_is;
	rptr = mp->b_rptr;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(OK_32PTR(rptr));
	ASSERT(ira->ira_pktlen == msgdsize(mp));
	pkt_len = ira->ira_pktlen;

	/*
	 * Get a snapshot of these and allow other threads to change
	 * them after that. We need the same recv_ancillary when determining
	 * the size as when adding the ancillary data items.
	 */
	mutex_enter(&connp->conn_lock);
	recv_ancillary = connp->conn_recv_ancillary;
	mutex_exit(&connp->conn_lock);

	ip_hdr_length = ira->ira_ip_hdr_length;
	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */

	/* Initialize regardless of IP version */
	ipps.ipp_fields = 0;

	if (ira->ira_flags & IRAF_IS_IPV4) {
		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));

		ipha = (ipha_t *)mp->b_rptr;
		/* Header fields are only parsed if ancillary data is wanted */
		if (recv_ancillary.crb_all != 0)
			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);

		/*
		 * BSD for some reason adjusts ipha_length to exclude the
		 * IP header length. We do the same.
		 */
		if (is->is_bsd_compat) {
			ushort_t len;

			len = ntohs(ipha->ipha_length);
			if (mp->b_datap->db_ref > 1) {
				/*
				 * Allocate a new IP header so that we can
				 * modify ipha_length.
				 * The data block is shared (db_ref > 1), so
				 * the header is copied into its own mblk and
				 * the original is chained behind it with its
				 * b_rptr moved past the header.
				 * (This mp1 shadows the function-level mp1.)
				 */
				mblk_t	*mp1;

				mp1 = allocb(ip_hdr_length, BPRI_MED);
				if (mp1 == NULL) {
					freemsg(mp);
					BUMP_MIB(&is->is_rawip_mib,
					    rawipInErrors);
					return;
				}
				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
				mp->b_rptr = rptr + ip_hdr_length;
				rptr = mp1->b_rptr;
				ipha = (ipha_t *)rptr;
				mp1->b_cont = mp;
				mp1->b_wptr = rptr + ip_hdr_length;
				mp = mp1;
			}
			len -= ip_hdr_length;
			ipha->ipha_length = htons(len);
		}

		/*
		 * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
		 * sockets. This is ensured by icmp_bind and the IP fanout code.
		 */
		ASSERT(connp->conn_family == AF_INET);

		/*
		 * This is the inbound data path.  Packets are passed upstream
		 * as T_UNITDATA_IND messages with full IPv4 headers still
		 * attached.
		 */

		/*
		 * Normally only send up the source address.
		 * If any ancillary data items are wanted we add those.
		 */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
		if (recv_ancillary.crb_all != 0) {
			udi_size += conn_recvancillary_size(connp,
			    recv_ancillary, ira, mp, &ipps);
		}

		/* Allocate a message block for the T_UNITDATA_IND structure. */
		mp1 = allocb(udi_size, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
			return;
		}
		mp1->b_cont = mp;
		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
		mp1->b_datap->db_type = M_PROTO;
		mp1->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		/* The source address immediately follows the TPI header */
		sin = (sin_t *)&tudi[1];
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipha->ipha_src;
		*(uint32_t *)&sin->sin_zero[0] = 0;
		*(uint32_t *)&sin->sin_zero[4] = 0;
		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
		    sizeof (sin_t);
		/* What remains of udi_size is the space for ancillary data */
		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
		tudi->OPT_length = udi_size;

		/*
		 * Add options if IP_RECVIF etc is set
		 */
		if (udi_size != 0) {
			conn_recvancillary_add(connp, recv_ancillary, ira,
			    &ipps, (uchar_t *)&sin[1], udi_size);
		}
		goto deliver;
	}

	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
	/*
	 * IPv6 packets can only be received by applications
	 * that are prepared to receive IPv6 addresses.
	 * The IP fanout must ensure this.
	 */
	ASSERT(connp->conn_family == AF_INET6);

	/*
	 * Handle IPv6 packets. We don't pass up the IP headers with the
	 * payload for IPv6.
	 */

	ip6h = (ip6_t *)rptr;
	if (recv_ancillary.crb_all != 0) {
		/*
		 * Call on ip_find_hdr_v6 which gets individual lengths of
		 * extension headers (and pointers to them).
		 */
		uint8_t		nexthdr;

		/* We don't care about the length or nextheader. */
		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);

		/*
		 * We do not pass up hop-by-hop options or any other
		 * extension header as part of the packet. Applications
		 * that want to see them have to specify IPV6_RECV* socket
		 * options. And conn_recvancillary_size/add explicitly
		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
		 *
		 * If we had multilevel ICMP sockets, then we'd want to
		 * modify conn_recvancillary_size/add to
		 * allow the user to see the label.
		 */
	}

	/*
	 * Check a filter for ICMPv6 types if needed.
	 * Verify raw checksums if needed.
	 * Both checks run under conn_lock.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_filter != NULL) {
		int type;

		/*
		 * Assumes that IP has done the pullupmsg
		 *
		 * NOTE(review): the type byte is read before the ASSERT
		 * below, and that ASSERT also accepts b_rptr + ip_hdr_length
		 * == b_wptr, i.e. no byte at that offset in this mblk.
		 * Presumably IP guarantees at least one byte of ICMPv6
		 * header here - confirm against the fanout code.
		 */
		type = mp->b_rptr[ip_hdr_length];

		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
			/* Filtered out: dropped without bumping a counter */
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			return;
		}
	}
	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
		/* Checksum */
		uint16_t	*up;
		uint32_t	sum;
		int		remlen;

		/*
		 * up[0..7] cover ip6_src; up[8..15] run on into the field
		 * that follows it in ip6_t (presumably ip6_dst).
		 */
		up = (uint16_t *)&ip6h->ip6_src;

		/*
		 * Start the sum with protocol + payload length and the
		 * sixteen address words, then let IP_CSUM add the payload.
		 */
		remlen = msgdsize(mp) - ip_hdr_length;
		sum = htons(connp->conn_proto + remlen)
		    + up[0] + up[1] + up[2] + up[3]
		    + up[4] + up[5] + up[6] + up[7]
		    + up[8] + up[9] + up[10] + up[11]
		    + up[12] + up[13] + up[14] + up[15];
		sum = (sum & 0xffff) + (sum >> 16);
		sum = IP_CSUM(mp, ip_hdr_length, sum);
		if (sum != 0) {
			/* IPv6 RAW checksum failed */
			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);

	/* Source address plus, if wanted, room for ancillary data */
	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);

	if (recv_ancillary.crb_all != 0) {
		udi_size += conn_recvancillary_size(connp,
		    recv_ancillary, ira, mp, &ipps);
	}

	mp1 = allocb(udi_size, BPRI_MED);
	if (mp1 == NULL) {
		freemsg(mp);
		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
		return;
	}
	mp1->b_cont = mp;
	mp1->b_datap->db_type = M_PROTO;
	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
	mp1->b_wptr = (uchar_t *)tudi + udi_size;
	tudi->PRIM_type = T_UNITDATA_IND;
	tudi->SRC_length = sizeof (sin6_t);
	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
	/* What remains of udi_size is the space for ancillary data */
	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
	tudi->OPT_length = udi_size;
	sin6 = (sin6_t *)&tudi[1];
	*sin6 = sin6_null;
	sin6->sin6_port = 0;
	sin6->sin6_family = AF_INET6;

	sin6->sin6_addr = ip6h->ip6_src;
	/* No sin6_flowinfo per API */
	sin6->sin6_flowinfo = 0;
	/* For link-scope pass up scope id */
	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
		sin6->sin6_scope_id = ira->ira_ruifindex;
	else
		sin6->sin6_scope_id = 0;
	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
	    IPCL_ZONEID(connp), is->is_netstack);

	if (udi_size != 0) {
		conn_recvancillary_add(connp, recv_ancillary, ira,
		    &ipps, (uchar_t *)&sin6[1], udi_size);
	}

	/* Skip all the IPv6 headers per API */
	mp->b_rptr += ip_hdr_length;
	pkt_len -= ip_hdr_length;

deliver:
	/* mp1 is the T_UNITDATA_IND with the packet chained on b_cont */
	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
	icmp_ulp_recv(connp, mp1, pkt_len);
}
2817 
2818 /*
2819  * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
2820  * information that can be changing beneath us.
2821  */
2822 mblk_t *
2823 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2824 {
2825 	mblk_t			*mpdata;
2826 	struct opthdr		*optp;
2827 	conn_t			*connp = Q_TO_CONN(q);
2828 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
2829 	mblk_t			*mp2ctl;
2830 
2831 	/*
2832 	 * make a copy of the original message
2833 	 */
2834 	mp2ctl = copymsg(mpctl);
2835 
2836 	if (mpctl == NULL ||
2837 	    (mpdata = mpctl->b_cont) == NULL) {
2838 		freemsg(mpctl);
2839 		freemsg(mp2ctl);
2840 		return (0);
2841 	}
2842 
2843 	/* fixed length structure for IPv4 and IPv6 counters */
2844 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2845 	optp->level = EXPER_RAWIP;
2846 	optp->name = 0;
2847 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2848 	    sizeof (is->is_rawip_mib));
2849 	optp->len = msgdsize(mpdata);
2850 	qreply(q, mpctl);
2851 
2852 	return (mp2ctl);
2853 }
2854 
2855 /*
2856  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2857  * TODO:  If this ever actually tries to set anything, it needs to be
2858  * to do the appropriate locking.
2859  */
2860 /* ARGSUSED */
2861 int
2862 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2863     uchar_t *ptr, int len)
2864 {
2865 	switch (level) {
2866 	case EXPER_RAWIP:
2867 		return (0);
2868 	default:
2869 		return (1);
2870 	}
2871 }
2872 
2873 /*
2874  * This routine creates a T_UDERROR_IND message and passes it upstream.
2875  * The address and options are copied from the T_UNITDATA_REQ message
2876  * passed in mp.  This message is freed.
2877  */
2878 static void
2879 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2880 {
2881 	struct T_unitdata_req *tudr;
2882 	mblk_t	*mp1;
2883 	uchar_t *destaddr;
2884 	t_scalar_t destlen;
2885 	uchar_t	*optaddr;
2886 	t_scalar_t optlen;
2887 
2888 	if ((mp->b_wptr < mp->b_rptr) ||
2889 	    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2890 		goto done;
2891 	}
2892 	tudr = (struct T_unitdata_req *)mp->b_rptr;
2893 	destaddr = mp->b_rptr + tudr->DEST_offset;
2894 	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2895 	    destaddr + tudr->DEST_length < mp->b_rptr ||
2896 	    destaddr + tudr->DEST_length > mp->b_wptr) {
2897 		goto done;
2898 	}
2899 	optaddr = mp->b_rptr + tudr->OPT_offset;
2900 	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2901 	    optaddr + tudr->OPT_length < mp->b_rptr ||
2902 	    optaddr + tudr->OPT_length > mp->b_wptr) {
2903 		goto done;
2904 	}
2905 	destlen = tudr->DEST_length;
2906 	optlen = tudr->OPT_length;
2907 
2908 	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2909 	    (char *)optaddr, optlen, err);
2910 	if (mp1 != NULL)
2911 		qreply(q, mp1);
2912 
2913 done:
2914 	freemsg(mp);
2915 }
2916 
2917 static int
2918 rawip_do_unbind(conn_t *connp)
2919 {
2920 	icmp_t	*icmp = connp->conn_icmp;
2921 
2922 	mutex_enter(&connp->conn_lock);
2923 	/* If a bind has not been done, we can't unbind. */
2924 	if (icmp->icmp_state == TS_UNBND) {
2925 		mutex_exit(&connp->conn_lock);
2926 		return (-TOUTSTATE);
2927 	}
2928 	connp->conn_saddr_v6 = ipv6_all_zeros;
2929 	connp->conn_bound_addr_v6 = ipv6_all_zeros;
2930 	connp->conn_laddr_v6 = ipv6_all_zeros;
2931 	connp->conn_mcbc_bind = B_FALSE;
2932 	connp->conn_lport = 0;
2933 	connp->conn_fport = 0;
2934 	/* In case we were also connected */
2935 	connp->conn_faddr_v6 = ipv6_all_zeros;
2936 	connp->conn_v6lastdst = ipv6_all_zeros;
2937 
2938 	icmp->icmp_state = TS_UNBND;
2939 
2940 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
2941 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
2942 	mutex_exit(&connp->conn_lock);
2943 
2944 	ip_unbind(connp);
2945 	return (0);
2946 }
2947 
2948 /*
2949  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
2950  * After some error checking, the message is passed downstream to ip.
2951  */
2952 static void
2953 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
2954 {
2955 	conn_t	*connp = Q_TO_CONN(q);
2956 	int	error;
2957 
2958 	ASSERT(mp->b_cont == NULL);
2959 	error = rawip_do_unbind(connp);
2960 	if (error) {
2961 		if (error < 0) {
2962 			icmp_err_ack(q, mp, -error, 0);
2963 		} else {
2964 			icmp_err_ack(q, mp, 0, error);
2965 		}
2966 		return;
2967 	}
2968 
2969 	/*
2970 	 * Convert mp into a T_OK_ACK
2971 	 */
2972 
2973 	mp = mi_tpi_ok_ack_alloc(mp);
2974 
2975 	/*
2976 	 * should not happen in practice... T_OK_ACK is smaller than the
2977 	 * original message.
2978 	 */
2979 	ASSERT(mp != NULL);
2980 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
2981 	qreply(q, mp);
2982 }
2983 
2984 /*
2985  * Process IPv4 packets that already include an IP header.
2986  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
2987  * IPPROTO_IGMP).
2988  * In this case we ignore the address and any options in the T_UNITDATA_REQ.
2989  *
2990  * The packet is assumed to have a base (20 byte) IP header followed
2991  * by the upper-layer protocol. We include any IP_OPTIONS including a
2992  * CIPSO label but otherwise preserve the base IP header.
2993  */
2994 static int
2995 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
2996 {
2997 	icmp_t		*icmp = connp->conn_icmp;
2998 	icmp_stack_t	*is = icmp->icmp_is;
2999 	ipha_t		iphas;
3000 	ipha_t		*ipha;
3001 	int		ip_hdr_length;
3002 	int		tp_hdr_len;
3003 	ip_xmit_attr_t	*ixa;
3004 	ip_pkt_t	*ipp;
3005 	in6_addr_t	v6src;
3006 	in6_addr_t	v6dst;
3007 	in6_addr_t	v6nexthop;
3008 	int		error;
3009 	boolean_t	do_ipsec;
3010 
3011 	/*
3012 	 * We need an exclusive copy of conn_ixa since the included IP
3013 	 * header could have any destination.
3014 	 * That copy has no pointers hence we
3015 	 * need to set them up once we've parsed the ancillary data.
3016 	 */
3017 	ixa = conn_get_ixa_exclusive(connp);
3018 	if (ixa == NULL) {
3019 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3020 		freemsg(mp);
3021 		return (ENOMEM);
3022 	}
3023 	ASSERT(cr != NULL);
3024 	/*
3025 	 * Caller has a reference on cr; from db_credp or because we
3026 	 * are running in process context.
3027 	 */
3028 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3029 	ixa->ixa_cred = cr;
3030 	ixa->ixa_cpid = pid;
3031 	if (is_system_labeled()) {
3032 		/* We need to restart with a label based on the cred */
3033 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3034 	}
3035 
3036 	/* In case previous destination was multicast or multirt */
3037 	ip_attr_newdst(ixa);
3038 
3039 	/* Get a copy of conn_xmit_ipp since the TX label might change it */
3040 	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3041 	if (ipp == NULL) {
3042 		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3043 		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3044 		ixa->ixa_cpid = connp->conn_cpid;
3045 		ixa_refrele(ixa);
3046 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3047 		freemsg(mp);
3048 		return (ENOMEM);
3049 	}
3050 	mutex_enter(&connp->conn_lock);
3051 	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3052 	mutex_exit(&connp->conn_lock);
3053 	if (error != 0) {
3054 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3055 		freemsg(mp);
3056 		goto done;
3057 	}
3058 
3059 	/* Sanity check length of packet */
3060 	ipha = (ipha_t *)mp->b_rptr;
3061 
3062 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3063 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3064 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3065 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3066 			freemsg(mp);
3067 			goto done;
3068 		}
3069 		ipha = (ipha_t *)mp->b_rptr;
3070 	}
3071 	ipha->ipha_version_and_hdr_length =
3072 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
3073 
3074 	/*
3075 	 * We set IXAF_DONTFRAG if the application set DF which makes
3076 	 * IP not fragment.
3077 	 */
3078 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3079 	if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3080 		ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3081 	else
3082 		ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3083 
3084 	/* Even for multicast and broadcast we honor the apps ttl */
3085 	ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3086 
3087 	/*
3088 	 * No source verification for non-local addresses
3089 	 */
3090 	if (ipha->ipha_src != INADDR_ANY &&
3091 	    ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
3092 	    is->is_netstack->netstack_ip, B_FALSE)
3093 	    != IPVL_UNICAST_UP) {
3094 		ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3095 	}
3096 
3097 	if (ipha->ipha_dst == INADDR_ANY)
3098 		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3099 
3100 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3101 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3102 
3103 	/* Defer IPsec if it might need to look at ICMP type/code */
3104 	do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3105 	ixa->ixa_flags |= IXAF_IS_IPV4;
3106 
3107 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3108 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3109 	    connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3110 	    (do_ipsec ? IPDF_IPSEC : 0));
3111 	switch (error) {
3112 	case 0:
3113 		break;
3114 	case EADDRNOTAVAIL:
3115 		/*
3116 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3117 		 * Don't have the application see that errno
3118 		 */
3119 		error = ENETUNREACH;
3120 		goto failed;
3121 	case ENETDOWN:
3122 		/*
3123 		 * Have !ipif_addr_ready address; drop packet silently
3124 		 * until we can get applications to not send until we
3125 		 * are ready.
3126 		 */
3127 		error = 0;
3128 		goto failed;
3129 	case EHOSTUNREACH:
3130 	case ENETUNREACH:
3131 		if (ixa->ixa_ire != NULL) {
3132 			/*
3133 			 * Let conn_ip_output/ire_send_noroute return
3134 			 * the error and send any local ICMP error.
3135 			 */
3136 			error = 0;
3137 			break;
3138 		}
3139 		/* FALLTHRU */
3140 	default:
3141 	failed:
3142 		freemsg(mp);
3143 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3144 		goto done;
3145 	}
3146 	if (ipha->ipha_src == INADDR_ANY)
3147 		IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3148 
3149 	/*
3150 	 * We might be going to a different destination than last time,
3151 	 * thus check that TX allows the communication and compute any
3152 	 * needed label.
3153 	 *
3154 	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3155 	 * don't have to worry about concurrent threads.
3156 	 */
3157 	if (is_system_labeled()) {
3158 		/*
3159 		 * Check whether Trusted Solaris policy allows communication
3160 		 * with this host, and pretend that the destination is
3161 		 * unreachable if not.
3162 		 * Compute any needed label and place it in ipp_label_v4/v6.
3163 		 *
3164 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3165 		 * ipp_label_v4/v6 to form the packet.
3166 		 *
3167 		 * Tsol note: We have ipp structure local to this thread so
3168 		 * no locking is needed.
3169 		 */
3170 		error = conn_update_label(connp, ixa, &v6dst, ipp);
3171 		if (error != 0) {
3172 			freemsg(mp);
3173 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3174 			goto done;
3175 		}
3176 	}
3177 
3178 	/*
3179 	 * Save away a copy of the IPv4 header the application passed down
3180 	 * and then prepend an IPv4 header complete with any IP options
3181 	 * including label.
3182 	 * We need a struct copy since icmp_prepend_hdr will reuse the available
3183 	 * space in the mblk.
3184 	 */
3185 	iphas = *ipha;
3186 	mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3187 
3188 	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3189 	if (mp == NULL) {
3190 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3191 		ASSERT(error != 0);
3192 		goto done;
3193 	}
3194 	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3195 		error = EMSGSIZE;
3196 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3197 		freemsg(mp);
3198 		goto done;
3199 	}
3200 	/* Restore key parts of the header that the application passed down */
3201 	ipha = (ipha_t *)mp->b_rptr;
3202 	ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3203 	ipha->ipha_ident = iphas.ipha_ident;
3204 	ipha->ipha_fragment_offset_and_flags =
3205 	    iphas.ipha_fragment_offset_and_flags;
3206 	ipha->ipha_ttl = iphas.ipha_ttl;
3207 	ipha->ipha_protocol = iphas.ipha_protocol;
3208 	ipha->ipha_src = iphas.ipha_src;
3209 	ipha->ipha_dst = iphas.ipha_dst;
3210 
3211 	ixa->ixa_protocol = ipha->ipha_protocol;
3212 
3213 	/*
3214 	 * Make sure that the IP header plus any transport header that is
3215 	 * checksumed by ip_output is in the first mblk. (ip_output assumes
3216 	 * that at least the checksum field is in the first mblk.)
3217 	 */
3218 	switch (ipha->ipha_protocol) {
3219 	case IPPROTO_UDP:
3220 		tp_hdr_len = 8;
3221 		break;
3222 	case IPPROTO_TCP:
3223 		tp_hdr_len = 20;
3224 		break;
3225 	default:
3226 		tp_hdr_len = 0;
3227 		break;
3228 	}
3229 	ip_hdr_length = IPH_HDR_LENGTH(ipha);
3230 	if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3231 		if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3232 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3233 			if (mp->b_cont == NULL)
3234 				error = EINVAL;
3235 			else
3236 				error = ENOMEM;
3237 			freemsg(mp);
3238 			goto done;
3239 		}
3240 	}
3241 
3242 	if (!do_ipsec) {
3243 		/* Policy might differ for different ICMP type/code */
3244 		if (ixa->ixa_ipsec_policy != NULL) {
3245 			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3246 			ixa->ixa_ipsec_policy = NULL;
3247 			ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3248 		}
3249 		mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3250 		if (mp == NULL) {
3251 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3252 			error = EHOSTUNREACH;	/* IPsec policy failure */
3253 			goto done;
3254 		}
3255 	}
3256 
3257 	/* We're done.  Pass the packet to ip. */
3258 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3259 
3260 	error = conn_ip_output(mp, ixa);
3261 	/* No rawipOutErrors if an error since IP increases its error counter */
3262 	switch (error) {
3263 	case 0:
3264 		break;
3265 	case EWOULDBLOCK:
3266 		(void) ixa_check_drain_insert(connp, ixa);
3267 		error = 0;
3268 		break;
3269 	case EADDRNOTAVAIL:
3270 		/*
3271 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3272 		 * Don't have the application see that errno
3273 		 */
3274 		error = ENETUNREACH;
3275 		break;
3276 	}
3277 done:
3278 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3279 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3280 	ixa->ixa_cpid = connp->conn_cpid;
3281 	ixa_refrele(ixa);
3282 	ip_pkt_free(ipp);
3283 	kmem_free(ipp, sizeof (*ipp));
3284 	return (error);
3285 }
3286 
3287 static mblk_t *
3288 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3289 {
3290 	ipha_t	*ipha = NULL;
3291 	ip6_t	*ip6h = NULL;
3292 
3293 	if (ixa->ixa_flags & IXAF_IS_IPV4)
3294 		ipha = (ipha_t *)mp->b_rptr;
3295 	else
3296 		ip6h = (ip6_t *)mp->b_rptr;
3297 
3298 	if (ixa->ixa_ipsec_policy != NULL) {
3299 		IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3300 		ixa->ixa_ipsec_policy = NULL;
3301 		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3302 	}
3303 	return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3304 }
3305 
3306 /*
3307  * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3308  * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3309  * the TPI options, otherwise we take them from msg_control.
3310  * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3311  * Always consumes mp; never consumes tudr_mp.
3312  */
3313 static int
3314 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3315     mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3316 {
3317 	icmp_t		*icmp = connp->conn_icmp;
3318 	icmp_stack_t	*is = icmp->icmp_is;
3319 	int		error;
3320 	ip_xmit_attr_t	*ixa;
3321 	ip_pkt_t	*ipp;
3322 	in6_addr_t	v6src;
3323 	in6_addr_t	v6dst;
3324 	in6_addr_t	v6nexthop;
3325 	in_port_t	dstport;
3326 	uint32_t	flowinfo;
3327 	uint_t		srcid;
3328 	int		is_absreq_failure = 0;
3329 	conn_opt_arg_t	coas, *coa;
3330 
3331 	ASSERT(tudr_mp != NULL || msg != NULL);
3332 
3333 	/*
3334 	 * Get ixa before checking state to handle a disconnect race.
3335 	 *
3336 	 * We need an exclusive copy of conn_ixa since the ancillary data
3337 	 * options might modify it. That copy has no pointers hence we
3338 	 * need to set them up once we've parsed the ancillary data.
3339 	 */
3340 	ixa = conn_get_ixa_exclusive(connp);
3341 	if (ixa == NULL) {
3342 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3343 		freemsg(mp);
3344 		return (ENOMEM);
3345 	}
3346 	ASSERT(cr != NULL);
3347 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3348 	ixa->ixa_cred = cr;
3349 	ixa->ixa_cpid = pid;
3350 	if (is_system_labeled()) {
3351 		/* We need to restart with a label based on the cred */
3352 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3353 	}
3354 
3355 	/* In case previous destination was multicast or multirt */
3356 	ip_attr_newdst(ixa);
3357 
3358 	/* Get a copy of conn_xmit_ipp since the options might change it */
3359 	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3360 	if (ipp == NULL) {
3361 		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3362 		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3363 		ixa->ixa_cpid = connp->conn_cpid;
3364 		ixa_refrele(ixa);
3365 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3366 		freemsg(mp);
3367 		return (ENOMEM);
3368 	}
3369 	mutex_enter(&connp->conn_lock);
3370 	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3371 	mutex_exit(&connp->conn_lock);
3372 	if (error != 0) {
3373 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3374 		freemsg(mp);
3375 		goto done;
3376 	}
3377 
3378 	/*
3379 	 * Parse the options and update ixa and ipp as a result.
3380 	 */
3381 
3382 	coa = &coas;
3383 	coa->coa_connp = connp;
3384 	coa->coa_ixa = ixa;
3385 	coa->coa_ipp = ipp;
3386 	coa->coa_ancillary = B_TRUE;
3387 	coa->coa_changed = 0;
3388 
3389 	if (msg != NULL) {
3390 		error = process_auxiliary_options(connp, msg->msg_control,
3391 		    msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3392 	} else {
3393 		struct T_unitdata_req *tudr;
3394 
3395 		tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3396 		ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3397 		error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3398 		    &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3399 		    coa, &is_absreq_failure);
3400 	}
3401 	if (error != 0) {
3402 		/*
3403 		 * Note: No special action needed in this
3404 		 * module for "is_absreq_failure"
3405 		 */
3406 		freemsg(mp);
3407 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3408 		goto done;
3409 	}
3410 	ASSERT(is_absreq_failure == 0);
3411 
3412 	mutex_enter(&connp->conn_lock);
3413 	/*
3414 	 * If laddr is unspecified then we look at sin6_src_id.
3415 	 * We will give precedence to a source address set with IPV6_PKTINFO
3416 	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3417 	 * want ip_attr_connect to select a source (since it can fail) when
3418 	 * IPV6_PKTINFO is specified.
3419 	 * If this doesn't result in a source address then we get a source
3420 	 * from ip_attr_connect() below.
3421 	 */
3422 	v6src = connp->conn_saddr_v6;
3423 	if (sin != NULL) {
3424 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3425 		dstport = sin->sin_port;
3426 		flowinfo = 0;
3427 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3428 		ixa->ixa_flags |= IXAF_IS_IPV4;
3429 	} else if (sin6 != NULL) {
3430 		v6dst = sin6->sin6_addr;
3431 		dstport = sin6->sin6_port;
3432 		flowinfo = sin6->sin6_flowinfo;
3433 		srcid = sin6->__sin6_src_id;
3434 		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3435 			ixa->ixa_scopeid = sin6->sin6_scope_id;
3436 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
3437 		} else {
3438 			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3439 		}
3440 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3441 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3442 			    connp->conn_netstack);
3443 		}
3444 		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
3445 			ixa->ixa_flags |= IXAF_IS_IPV4;
3446 		else
3447 			ixa->ixa_flags &= ~IXAF_IS_IPV4;
3448 	} else {
3449 		/* Connected case */
3450 		v6dst = connp->conn_faddr_v6;
3451 		flowinfo = connp->conn_flowinfo;
3452 	}
3453 	mutex_exit(&connp->conn_lock);
3454 	/* Handle IPV6_PKTINFO setting source address. */
3455 	if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
3456 	    (ipp->ipp_fields & IPPF_ADDR)) {
3457 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
3458 			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3459 				v6src = ipp->ipp_addr;
3460 		} else {
3461 			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3462 				v6src = ipp->ipp_addr;
3463 		}
3464 	}
3465 	/*
3466 	 * Allow source not assigned to the system
3467 	 * only if it is not a local addresses
3468 	 */
3469 	if (!V6_OR_V4_INADDR_ANY(v6src)) {
3470 		ip_laddr_t laddr_type;
3471 
3472 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
3473 			ipaddr_t v4src;
3474 
3475 			IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
3476 			laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
3477 			    is->is_netstack->netstack_ip, B_FALSE);
3478 		} else {
3479 			laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
3480 			    is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
3481 		}
3482 		if (laddr_type != IPVL_UNICAST_UP)
3483 			ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3484 	}
3485 
3486 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3487 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3488 	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3489 
3490 	switch (error) {
3491 	case 0:
3492 		break;
3493 	case EADDRNOTAVAIL:
3494 		/*
3495 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3496 		 * Don't have the application see that errno
3497 		 */
3498 		error = ENETUNREACH;
3499 		goto failed;
3500 	case ENETDOWN:
3501 		/*
3502 		 * Have !ipif_addr_ready address; drop packet silently
3503 		 * until we can get applications to not send until we
3504 		 * are ready.
3505 		 */
3506 		error = 0;
3507 		goto failed;
3508 	case EHOSTUNREACH:
3509 	case ENETUNREACH:
3510 		if (ixa->ixa_ire != NULL) {
3511 			/*
3512 			 * Let conn_ip_output/ire_send_noroute return
3513 			 * the error and send any local ICMP error.
3514 			 */
3515 			error = 0;
3516 			break;
3517 		}
3518 		/* FALLTHRU */
3519 	default:
3520 	failed:
3521 		freemsg(mp);
3522 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3523 		goto done;
3524 	}
3525 
3526 	/*
3527 	 * We might be going to a different destination than last time,
3528 	 * thus check that TX allows the communication and compute any
3529 	 * needed label.
3530 	 *
3531 	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3532 	 * don't have to worry about concurrent threads.
3533 	 */
3534 	if (is_system_labeled()) {
3535 		/*
3536 		 * Check whether Trusted Solaris policy allows communication
3537 		 * with this host, and pretend that the destination is
3538 		 * unreachable if not.
3539 		 * Compute any needed label and place it in ipp_label_v4/v6.
3540 		 *
3541 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3542 		 * ipp_label_v4/v6 to form the packet.
3543 		 *
3544 		 * Tsol note: We have ipp structure local to this thread so
3545 		 * no locking is needed.
3546 		 */
3547 		error = conn_update_label(connp, ixa, &v6dst, ipp);
3548 		if (error != 0) {
3549 			freemsg(mp);
3550 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3551 			goto done;
3552 		}
3553 	}
3554 	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3555 	    &error);
3556 	if (mp == NULL) {
3557 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3558 		ASSERT(error != 0);
3559 		goto done;
3560 	}
3561 	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3562 		error = EMSGSIZE;
3563 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3564 		freemsg(mp);
3565 		goto done;
3566 	}
3567 
3568 	/* Policy might differ for different ICMP type/code */
3569 	mp = icmp_output_attach_policy(mp, connp, ixa);
3570 	if (mp == NULL) {
3571 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3572 		error = EHOSTUNREACH;	/* IPsec policy failure */
3573 		goto done;
3574 	}
3575 
3576 	/* We're done.  Pass the packet to ip. */
3577 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3578 
3579 	error = conn_ip_output(mp, ixa);
3580 	if (!connp->conn_unspec_src)
3581 		ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3582 	/* No rawipOutErrors if an error since IP increases its error counter */
3583 	switch (error) {
3584 	case 0:
3585 		break;
3586 	case EWOULDBLOCK:
3587 		(void) ixa_check_drain_insert(connp, ixa);
3588 		error = 0;
3589 		break;
3590 	case EADDRNOTAVAIL:
3591 		/*
3592 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3593 		 * Don't have the application see that errno
3594 		 */
3595 		error = ENETUNREACH;
3596 		/* FALLTHRU */
3597 	default:
3598 		mutex_enter(&connp->conn_lock);
3599 		/*
3600 		 * Clear the source and v6lastdst so we call ip_attr_connect
3601 		 * for the next packet and try to pick a better source.
3602 		 */
3603 		if (connp->conn_mcbc_bind)
3604 			connp->conn_saddr_v6 = ipv6_all_zeros;
3605 		else
3606 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3607 		connp->conn_v6lastdst = ipv6_all_zeros;
3608 		mutex_exit(&connp->conn_lock);
3609 		break;
3610 	}
3611 done:
3612 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3613 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3614 	ixa->ixa_cpid = connp->conn_cpid;
3615 	ixa_refrele(ixa);
3616 	ip_pkt_free(ipp);
3617 	kmem_free(ipp, sizeof (*ipp));
3618 	return (error);
3619 }
3620 
3621 /*
3622  * Handle sending an M_DATA for a connected socket.
3623  * Handles both IPv4 and IPv6.
3624  */
3625 int
3626 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3627 {
3628 	icmp_t		*icmp = connp->conn_icmp;
3629 	icmp_stack_t	*is = icmp->icmp_is;
3630 	int		error;
3631 	ip_xmit_attr_t	*ixa;
3632 	boolean_t	do_ipsec;
3633 
3634 	/*
3635 	 * If no other thread is using conn_ixa this just gets a reference to
3636 	 * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3637 	 */
3638 	ixa = conn_get_ixa(connp, B_FALSE);
3639 	if (ixa == NULL) {
3640 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3641 		freemsg(mp);
3642 		return (ENOMEM);
3643 	}
3644 
3645 	ASSERT(cr != NULL);
3646 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3647 	ixa->ixa_cred = cr;
3648 	ixa->ixa_cpid = pid;
3649 
3650 	/* Defer IPsec if it might need to look at ICMP type/code */
3651 	switch (ixa->ixa_protocol) {
3652 	case IPPROTO_ICMP:
3653 	case IPPROTO_ICMPV6:
3654 		do_ipsec = B_FALSE;
3655 		break;
3656 	default:
3657 		do_ipsec = B_TRUE;
3658 	}
3659 
3660 	mutex_enter(&connp->conn_lock);
3661 	mp = icmp_prepend_header_template(connp, ixa, mp,
3662 	    &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3663 
3664 	if (mp == NULL) {
3665 		ASSERT(error != 0);
3666 		mutex_exit(&connp->conn_lock);
3667 		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3668 		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3669 		ixa->ixa_cpid = connp->conn_cpid;
3670 		ixa_refrele(ixa);
3671 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3672 		freemsg(mp);
3673 		return (error);
3674 	}
3675 
3676 	if (!do_ipsec) {
3677 		/* Policy might differ for different ICMP type/code */
3678 		mp = icmp_output_attach_policy(mp, connp, ixa);
3679 		if (mp == NULL) {
3680 			mutex_exit(&connp->conn_lock);
3681 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3682 			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3683 			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3684 			ixa->ixa_cpid = connp->conn_cpid;
3685 			ixa_refrele(ixa);
3686 			return (EHOSTUNREACH);	/* IPsec policy failure */
3687 		}
3688 	}
3689 
3690 	/*
3691 	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3692 	 * safe copy, then we need to fill in any pointers in it.
3693 	 */
3694 	if (ixa->ixa_ire == NULL) {
3695 		in6_addr_t	faddr, saddr;
3696 		in6_addr_t	nexthop;
3697 		in_port_t	fport;
3698 
3699 		saddr = connp->conn_saddr_v6;
3700 		faddr = connp->conn_faddr_v6;
3701 		fport = connp->conn_fport;
3702 		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3703 		mutex_exit(&connp->conn_lock);
3704 
3705 		error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3706 		    fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3707 		    (do_ipsec ? IPDF_IPSEC : 0));
3708 		switch (error) {
3709 		case 0:
3710 			break;
3711 		case EADDRNOTAVAIL:
3712 			/*
3713 			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3714 			 * Don't have the application see that errno
3715 			 */
3716 			error = ENETUNREACH;
3717 			goto failed;
3718 		case ENETDOWN:
3719 			/*
3720 			 * Have !ipif_addr_ready address; drop packet silently
3721 			 * until we can get applications to not send until we
3722 			 * are ready.
3723 			 */
3724 			error = 0;
3725 			goto failed;
3726 		case EHOSTUNREACH:
3727 		case ENETUNREACH:
3728 			if (ixa->ixa_ire != NULL) {
3729 				/*
3730 				 * Let conn_ip_output/ire_send_noroute return
3731 				 * the error and send any local ICMP error.
3732 				 */
3733 				error = 0;
3734 				break;
3735 			}
3736 			/* FALLTHRU */
3737 		default:
3738 		failed:
3739 			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3740 			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3741 			ixa->ixa_cpid = connp->conn_cpid;
3742 			ixa_refrele(ixa);
3743 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3744 			freemsg(mp);
3745 			return (error);
3746 		}
3747 	} else {
3748 		/* Done with conn_t */
3749 		mutex_exit(&connp->conn_lock);
3750 	}
3751 
3752 	/* We're done.  Pass the packet to ip. */
3753 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3754 
3755 	error = conn_ip_output(mp, ixa);
3756 	/* No rawipOutErrors if an error since IP increases its error counter */
3757 	switch (error) {
3758 	case 0:
3759 		break;
3760 	case EWOULDBLOCK:
3761 		(void) ixa_check_drain_insert(connp, ixa);
3762 		error = 0;
3763 		break;
3764 	case EADDRNOTAVAIL:
3765 		/*
3766 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3767 		 * Don't have the application see that errno
3768 		 */
3769 		error = ENETUNREACH;
3770 		break;
3771 	}
3772 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3773 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3774 	ixa->ixa_cpid = connp->conn_cpid;
3775 	ixa_refrele(ixa);
3776 	return (error);
3777 }
3778 
3779 /*
3780  * Handle sending an M_DATA to the last destination.
3781  * Handles both IPv4 and IPv6.
3782  *
3783  * NOTE: The caller must hold conn_lock and we drop it here.
3784  */
3785 int
3786 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3787     ip_xmit_attr_t *ixa)
3788 {
3789 	icmp_t		*icmp = connp->conn_icmp;
3790 	icmp_stack_t	*is = icmp->icmp_is;
3791 	int		error;
3792 	boolean_t	do_ipsec;
3793 
3794 	ASSERT(MUTEX_HELD(&connp->conn_lock));
3795 	ASSERT(ixa != NULL);
3796 
3797 	ASSERT(cr != NULL);
3798 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3799 	ixa->ixa_cred = cr;
3800 	ixa->ixa_cpid = pid;
3801 
3802 	/* Defer IPsec if it might need to look at ICMP type/code */
3803 	switch (ixa->ixa_protocol) {
3804 	case IPPROTO_ICMP:
3805 	case IPPROTO_ICMPV6:
3806 		do_ipsec = B_FALSE;
3807 		break;
3808 	default:
3809 		do_ipsec = B_TRUE;
3810 	}
3811 
3812 
3813 	mp = icmp_prepend_header_template(connp, ixa, mp,
3814 	    &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3815 
3816 	if (mp == NULL) {
3817 		ASSERT(error != 0);
3818 		mutex_exit(&connp->conn_lock);
3819 		ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3820 		ixa->ixa_cred = connp->conn_cred;	/* Restore */
3821 		ixa->ixa_cpid = connp->conn_cpid;
3822 		ixa_refrele(ixa);
3823 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3824 		freemsg(mp);
3825 		return (error);
3826 	}
3827 
3828 	if (!do_ipsec) {
3829 		/* Policy might differ for different ICMP type/code */
3830 		mp = icmp_output_attach_policy(mp, connp, ixa);
3831 		if (mp == NULL) {
3832 			mutex_exit(&connp->conn_lock);
3833 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3834 			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3835 			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3836 			ixa->ixa_cpid = connp->conn_cpid;
3837 			ixa_refrele(ixa);
3838 			return (EHOSTUNREACH);	/* IPsec policy failure */
3839 		}
3840 	}
3841 
3842 	/*
3843 	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3844 	 * safe copy, then we need to fill in any pointers in it.
3845 	 */
3846 	if (ixa->ixa_ire == NULL) {
3847 		in6_addr_t	lastdst, lastsrc;
3848 		in6_addr_t	nexthop;
3849 		in_port_t	lastport;
3850 
3851 		lastsrc = connp->conn_v6lastsrc;
3852 		lastdst = connp->conn_v6lastdst;
3853 		lastport = connp->conn_lastdstport;
3854 		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3855 		mutex_exit(&connp->conn_lock);
3856 
3857 		error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3858 		    &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3859 		    IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
3860 		switch (error) {
3861 		case 0:
3862 			break;
3863 		case EADDRNOTAVAIL:
3864 			/*
3865 			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3866 			 * Don't have the application see that errno
3867 			 */
3868 			error = ENETUNREACH;
3869 			goto failed;
3870 		case ENETDOWN:
3871 			/*
3872 			 * Have !ipif_addr_ready address; drop packet silently
3873 			 * until we can get applications to not send until we
3874 			 * are ready.
3875 			 */
3876 			error = 0;
3877 			goto failed;
3878 		case EHOSTUNREACH:
3879 		case ENETUNREACH:
3880 			if (ixa->ixa_ire != NULL) {
3881 				/*
3882 				 * Let conn_ip_output/ire_send_noroute return
3883 				 * the error and send any local ICMP error.
3884 				 */
3885 				error = 0;
3886 				break;
3887 			}
3888 			/* FALLTHRU */
3889 		default:
3890 		failed:
3891 			ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3892 			ixa->ixa_cred = connp->conn_cred;	/* Restore */
3893 			ixa->ixa_cpid = connp->conn_cpid;
3894 			ixa_refrele(ixa);
3895 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3896 			freemsg(mp);
3897 			return (error);
3898 		}
3899 	} else {
3900 		/* Done with conn_t */
3901 		mutex_exit(&connp->conn_lock);
3902 	}
3903 
3904 	/* We're done.  Pass the packet to ip. */
3905 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3906 	error = conn_ip_output(mp, ixa);
3907 	/* No rawipOutErrors if an error since IP increases its error counter */
3908 	switch (error) {
3909 	case 0:
3910 		break;
3911 	case EWOULDBLOCK:
3912 		(void) ixa_check_drain_insert(connp, ixa);
3913 		error = 0;
3914 		break;
3915 	case EADDRNOTAVAIL:
3916 		/*
3917 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3918 		 * Don't have the application see that errno
3919 		 */
3920 		error = ENETUNREACH;
3921 		/* FALLTHRU */
3922 	default:
3923 		mutex_enter(&connp->conn_lock);
3924 		/*
3925 		 * Clear the source and v6lastdst so we call ip_attr_connect
3926 		 * for the next packet and try to pick a better source.
3927 		 */
3928 		if (connp->conn_mcbc_bind)
3929 			connp->conn_saddr_v6 = ipv6_all_zeros;
3930 		else
3931 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3932 		connp->conn_v6lastdst = ipv6_all_zeros;
3933 		mutex_exit(&connp->conn_lock);
3934 		break;
3935 	}
3936 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3937 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
3938 	ixa->ixa_cpid = connp->conn_cpid;
3939 	ixa_refrele(ixa);
3940 	return (error);
3941 }
3942 
3943 
3944 /*
3945  * Prepend the header template and then fill in the source and
3946  * flowinfo. The caller needs to handle the destination address since
3947  * it's setting is different if rthdr or source route.
3948  *
3949  * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
3950  * When it returns NULL it sets errorp.
3951  */
3952 static mblk_t *
3953 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3954     const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
3955 {
3956 	icmp_t		*icmp = connp->conn_icmp;
3957 	icmp_stack_t	*is = icmp->icmp_is;
3958 	uint_t		pktlen;
3959 	uint_t		copylen;
3960 	uint8_t		*iph;
3961 	uint_t		ip_hdr_length;
3962 	uint32_t	cksum;
3963 	ip_pkt_t	*ipp;
3964 
3965 	ASSERT(MUTEX_HELD(&connp->conn_lock));
3966 
3967 	/*
3968 	 * Copy the header template.
3969 	 */
3970 	copylen = connp->conn_ht_iphc_len;
3971 	pktlen = copylen + msgdsize(mp);
3972 	if (pktlen > IP_MAXPACKET) {
3973 		freemsg(mp);
3974 		*errorp = EMSGSIZE;
3975 		return (NULL);
3976 	}
3977 	ixa->ixa_pktlen = pktlen;
3978 
3979 	/* check/fix buffer config, setup pointers into it */
3980 	iph = mp->b_rptr - copylen;
3981 	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3982 		mblk_t *mp1;
3983 
3984 		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
3985 		if (mp1 == NULL) {
3986 			freemsg(mp);
3987 			*errorp = ENOMEM;
3988 			return (NULL);
3989 		}
3990 		mp1->b_wptr = DB_LIM(mp1);
3991 		mp1->b_cont = mp;
3992 		mp = mp1;
3993 		iph = (mp->b_wptr - copylen);
3994 	}
3995 	mp->b_rptr = iph;
3996 	bcopy(connp->conn_ht_iphc, iph, copylen);
3997 	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
3998 
3999 	ixa->ixa_ip_hdr_length = ip_hdr_length;
4000 
4001 	/*
4002 	 * Prepare for ICMPv6 checksum done in IP.
4003 	 *
4004 	 * icmp_build_hdr_template has already massaged any routing header
4005 	 * and placed the result in conn_sum.
4006 	 *
4007 	 * We make it easy for IP to include our pseudo header
4008 	 * by putting our length (and any routing header adjustment)
4009 	 * in the ICMPv6 checksum field.
4010 	 */
4011 	cksum = pktlen - ip_hdr_length;
4012 
4013 	cksum += connp->conn_sum;
4014 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
4015 	ASSERT(cksum < 0x10000);
4016 
4017 	ipp = &connp->conn_xmit_ipp;
4018 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
4019 		ipha_t	*ipha = (ipha_t *)iph;
4020 
4021 		ipha->ipha_length = htons((uint16_t)pktlen);
4022 
4023 		/* if IP_PKTINFO specified an addres it wins over bind() */
4024 		if ((ipp->ipp_fields & IPPF_ADDR) &&
4025 		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4026 			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
4027 			ipha->ipha_src = ipp->ipp_addr_v4;
4028 		} else {
4029 			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
4030 		}
4031 	} else {
4032 		ip6_t *ip6h = (ip6_t *)iph;
4033 		uint_t	cksum_offset = 0;
4034 
4035 		ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
4036 
4037 		/* if IP_PKTINFO specified an addres it wins over bind() */
4038 		if ((ipp->ipp_fields & IPPF_ADDR) &&
4039 		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4040 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
4041 			ip6h->ip6_src = ipp->ipp_addr;
4042 		} else {
4043 			ip6h->ip6_src = *v6src;
4044 		}
4045 		ip6h->ip6_vcf =
4046 		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4047 		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4048 		if (ipp->ipp_fields & IPPF_TCLASS) {
4049 			/* Overrides the class part of flowinfo */
4050 			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4051 			    ipp->ipp_tclass);
4052 		}
4053 
4054 		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
4055 			if (connp->conn_proto == IPPROTO_ICMPV6) {
4056 				cksum_offset = ixa->ixa_ip_hdr_length +
4057 				    offsetof(icmp6_t, icmp6_cksum);
4058 			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4059 				cksum_offset = ixa->ixa_ip_hdr_length +
4060 				    ixa->ixa_raw_cksum_offset;
4061 			}
4062 		}
4063 		if (cksum_offset != 0) {
4064 			uint16_t *ptr;
4065 
4066 			/* Make sure the checksum fits in the first mblk */
4067 			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4068 				mblk_t *mp1;
4069 
4070 				mp1 = msgpullup(mp,
4071 				    cksum_offset + sizeof (short));
4072 				freemsg(mp);
4073 				if (mp1 == NULL) {
4074 					*errorp = ENOMEM;
4075 					return (NULL);
4076 				}
4077 				mp = mp1;
4078 				iph = mp->b_rptr;
4079 				ip6h = (ip6_t *)iph;
4080 			}
4081 			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4082 			*ptr = htons(cksum);
4083 		}
4084 	}
4085 
4086 	return (mp);
4087 }
4088 
4089 /*
4090  * This routine handles all messages passed downstream.  It either
4091  * consumes the message or passes it downstream; it never queues a
4092  * a message.
4093  */
4094 void
4095 icmp_wput(queue_t *q, mblk_t *mp)
4096 {
4097 	sin6_t		*sin6;
4098 	sin_t		*sin = NULL;
4099 	uint_t		srcid;
4100 	conn_t		*connp = Q_TO_CONN(q);
4101 	icmp_t		*icmp = connp->conn_icmp;
4102 	int		error = 0;
4103 	struct sockaddr	*addr = NULL;
4104 	socklen_t	addrlen;
4105 	icmp_stack_t	*is = icmp->icmp_is;
4106 	struct T_unitdata_req *tudr;
4107 	mblk_t		*data_mp;
4108 	cred_t		*cr;
4109 	pid_t		pid;
4110 
4111 	/*
4112 	 * We directly handle several cases here: T_UNITDATA_REQ message
4113 	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
4114 	 * socket.
4115 	 */
4116 	switch (DB_TYPE(mp)) {
4117 	case M_DATA:
4118 		/* sockfs never sends down M_DATA */
4119 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4120 		freemsg(mp);
4121 		return;
4122 
4123 	case M_PROTO:
4124 	case M_PCPROTO:
4125 		tudr = (struct T_unitdata_req *)mp->b_rptr;
4126 		if (MBLKL(mp) < sizeof (*tudr) ||
4127 		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
4128 			icmp_wput_other(q, mp);
4129 			return;
4130 		}
4131 		break;
4132 
4133 	default:
4134 		icmp_wput_other(q, mp);
4135 		return;
4136 	}
4137 
4138 	/* Handle valid T_UNITDATA_REQ here */
4139 	data_mp = mp->b_cont;
4140 	if (data_mp == NULL) {
4141 		error = EPROTO;
4142 		goto ud_error2;
4143 	}
4144 	mp->b_cont = NULL;
4145 
4146 	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
4147 		error = EADDRNOTAVAIL;
4148 		goto ud_error2;
4149 	}
4150 
4151 	/*
4152 	 * All Solaris components should pass a db_credp
4153 	 * for this message, hence we ASSERT.
4154 	 * On production kernels we return an error to be robust against
4155 	 * random streams modules sitting on top of us.
4156 	 */
4157 	cr = msg_getcred(mp, &pid);
4158 	ASSERT(cr != NULL);
4159 	if (cr == NULL) {
4160 		error = EINVAL;
4161 		goto ud_error2;
4162 	}
4163 
4164 	/*
4165 	 * If a port has not been bound to the stream, fail.
4166 	 * This is not a problem when sockfs is directly
4167 	 * above us, because it will ensure that the socket
4168 	 * is first bound before allowing data to be sent.
4169 	 */
4170 	if (icmp->icmp_state == TS_UNBND) {
4171 		error = EPROTO;
4172 		goto ud_error2;
4173 	}
4174 	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
4175 	addrlen = tudr->DEST_length;
4176 
4177 	switch (connp->conn_family) {
4178 	case AF_INET6:
4179 		sin6 = (sin6_t *)addr;
4180 		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
4181 		    (sin6->sin6_family != AF_INET6)) {
4182 			error = EADDRNOTAVAIL;
4183 			goto ud_error2;
4184 		}
4185 
4186 		/* No support for mapped addresses on raw sockets */
4187 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4188 			error = EADDRNOTAVAIL;
4189 			goto ud_error2;
4190 		}
4191 		srcid = sin6->__sin6_src_id;
4192 
4193 		/*
4194 		 * If the local address is a mapped address return
4195 		 * an error.
4196 		 * It would be possible to send an IPv6 packet but the
4197 		 * response would never make it back to the application
4198 		 * since it is bound to a mapped address.
4199 		 */
4200 		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
4201 			error = EADDRNOTAVAIL;
4202 			goto ud_error2;
4203 		}
4204 
4205 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4206 			sin6->sin6_addr = ipv6_loopback;
4207 
4208 		if (tudr->OPT_length != 0) {
4209 			/*
4210 			 * If we are connected then the destination needs to be
4211 			 * the same as the connected one.
4212 			 */
4213 			if (icmp->icmp_state == TS_DATA_XFER &&
4214 			    !conn_same_as_last_v6(connp, sin6)) {
4215 				error = EISCONN;
4216 				goto ud_error2;
4217 			}
4218 			error = icmp_output_ancillary(connp, NULL, sin6,
4219 			    data_mp, mp, NULL, cr, pid);
4220 		} else {
4221 			ip_xmit_attr_t *ixa;
4222 
4223 			/*
4224 			 * We have to allocate an ip_xmit_attr_t before we grab
4225 			 * conn_lock and we need to hold conn_lock once we've
4226 			 * checked conn_same_as_last_v6 to handle concurrent
4227 			 * send* calls on a socket.
4228 			 */
4229 			ixa = conn_get_ixa(connp, B_FALSE);
4230 			if (ixa == NULL) {
4231 				error = ENOMEM;
4232 				goto ud_error2;
4233 			}
4234 			mutex_enter(&connp->conn_lock);
4235 
4236 			if (conn_same_as_last_v6(connp, sin6) &&
4237 			    connp->conn_lastsrcid == srcid &&
4238 			    ipsec_outbound_policy_current(ixa)) {
4239 				/* icmp_output_lastdst drops conn_lock */
4240 				error = icmp_output_lastdst(connp, data_mp, cr,
4241 				    pid, ixa);
4242 			} else {
4243 				/* icmp_output_newdst drops conn_lock */
4244 				error = icmp_output_newdst(connp, data_mp, NULL,
4245 				    sin6, cr, pid, ixa);
4246 			}
4247 			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4248 		}
4249 		if (error == 0) {
4250 			freeb(mp);
4251 			return;
4252 		}
4253 		break;
4254 
4255 	case AF_INET:
4256 		sin = (sin_t *)addr;
4257 		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
4258 		    (sin->sin_family != AF_INET)) {
4259 			error = EADDRNOTAVAIL;
4260 			goto ud_error2;
4261 		}
4262 		if (sin->sin_addr.s_addr == INADDR_ANY)
4263 			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
4264 
4265 		/* Protocol 255 contains full IP headers */
4266 		/* Read without holding lock */
4267 		if (icmp->icmp_hdrincl) {
4268 			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
4269 				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
4270 					error = EINVAL;
4271 					goto ud_error2;
4272 				}
4273 			}
4274 			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
4275 			if (error == 0) {
4276 				freeb(mp);
4277 				return;
4278 			}
4279 			/* data_mp consumed above */
4280 			data_mp = NULL;
4281 			goto ud_error2;
4282 		}
4283 
4284 		if (tudr->OPT_length != 0) {
4285 			/*
4286 			 * If we are connected then the destination needs to be
4287 			 * the same as the connected one.
4288 			 */
4289 			if (icmp->icmp_state == TS_DATA_XFER &&
4290 			    !conn_same_as_last_v4(connp, sin)) {
4291 				error = EISCONN;
4292 				goto ud_error2;
4293 			}
4294 			error = icmp_output_ancillary(connp, sin, NULL,
4295 			    data_mp, mp, NULL, cr, pid);
4296 		} else {
4297 			ip_xmit_attr_t *ixa;
4298 
4299 			/*
4300 			 * We have to allocate an ip_xmit_attr_t before we grab
4301 			 * conn_lock and we need to hold conn_lock once we've
4302 			 * checked conn_same_as_last_v4 to handle concurrent
4303 			 * send* calls on a socket.
4304 			 */
4305 			ixa = conn_get_ixa(connp, B_FALSE);
4306 			if (ixa == NULL) {
4307 				error = ENOMEM;
4308 				goto ud_error2;
4309 			}
4310 			mutex_enter(&connp->conn_lock);
4311 
4312 			if (conn_same_as_last_v4(connp, sin) &&
4313 			    ipsec_outbound_policy_current(ixa)) {
4314 				/* icmp_output_lastdst drops conn_lock */
4315 				error = icmp_output_lastdst(connp, data_mp, cr,
4316 				    pid, ixa);
4317 			} else {
4318 				/* icmp_output_newdst drops conn_lock */
4319 				error = icmp_output_newdst(connp, data_mp, sin,
4320 				    NULL, cr, pid, ixa);
4321 			}
4322 			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4323 		}
4324 		if (error == 0) {
4325 			freeb(mp);
4326 			return;
4327 		}
4328 		break;
4329 	}
4330 	ASSERT(mp != NULL);
4331 	/* mp is freed by the following routine */
4332 	icmp_ud_err(q, mp, (t_scalar_t)error);
4333 	return;
4334 
4335 ud_error2:
4336 	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4337 	freemsg(data_mp);
4338 	ASSERT(mp != NULL);
4339 	/* mp is freed by the following routine */
4340 	icmp_ud_err(q, mp, (t_scalar_t)error);
4341 }
4342 
4343 /*
4344  * Handle the case of the IP address or flow label being different
4345  * for both IPv4 and IPv6.
4346  *
4347  * NOTE: The caller must hold conn_lock and we drop it here.
4348  */
4349 static int
4350 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
4351     cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
4352 {
4353 	icmp_t		*icmp = connp->conn_icmp;
4354 	icmp_stack_t	*is = icmp->icmp_is;
4355 	int		error;
4356 	ip_xmit_attr_t	*oldixa;
4357 	boolean_t	do_ipsec;
4358 	uint_t		srcid;
4359 	uint32_t	flowinfo;
4360 	in6_addr_t	v6src;
4361 	in6_addr_t	v6dst;
4362 	in6_addr_t	v6nexthop;
4363 	in_port_t	dstport;
4364 
4365 	ASSERT(MUTEX_HELD(&connp->conn_lock));
4366 	ASSERT(ixa != NULL);
4367 
4368 	/*
4369 	 * We hold conn_lock across all the use and modifications of
4370 	 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
4371 	 * stay consistent.
4372 	 */
4373 
4374 	ASSERT(cr != NULL);
4375 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4376 	ixa->ixa_cred = cr;
4377 	ixa->ixa_cpid = pid;
4378 	if (is_system_labeled()) {
4379 		/* We need to restart with a label based on the cred */
4380 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
4381 	}
4382 	/*
4383 	 * If we are connected then the destination needs to be the
4384 	 * same as the connected one, which is not the case here since we
4385 	 * checked for that above.
4386 	 */
4387 	if (icmp->icmp_state == TS_DATA_XFER) {
4388 		mutex_exit(&connp->conn_lock);
4389 		error = EISCONN;
4390 		goto ud_error;
4391 	}
4392 
4393 	/* In case previous destination was multicast or multirt */
4394 	ip_attr_newdst(ixa);
4395 
4396 	/*
4397 	 * If laddr is unspecified then we look at sin6_src_id.
4398 	 * We will give precedence to a source address set with IPV6_PKTINFO
4399 	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
4400 	 * want ip_attr_connect to select a source (since it can fail) when
4401 	 * IPV6_PKTINFO is specified.
4402 	 * If this doesn't result in a source address then we get a source
4403 	 * from ip_attr_connect() below.
4404 	 */
4405 	v6src = connp->conn_saddr_v6;
4406 	if (sin != NULL) {
4407 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
4408 		dstport = sin->sin_port;
4409 		flowinfo = 0;
4410 		srcid = 0;
4411 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4412 		if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
4413 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4414 			    connp->conn_netstack);
4415 		}
4416 		ixa->ixa_flags |= IXAF_IS_IPV4;
4417 	} else {
4418 		v6dst = sin6->sin6_addr;
4419 		dstport = sin6->sin6_port;
4420 		flowinfo = sin6->sin6_flowinfo;
4421 		srcid = sin6->__sin6_src_id;
4422 		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
4423 			ixa->ixa_scopeid = sin6->sin6_scope_id;
4424 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
4425 		} else {
4426 			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4427 		}
4428 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4429 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4430 			    connp->conn_netstack);
4431 		}
4432 		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
4433 			ixa->ixa_flags |= IXAF_IS_IPV4;
4434 		else
4435 			ixa->ixa_flags &= ~IXAF_IS_IPV4;
4436 	}
4437 	/* Handle IPV6_PKTINFO setting source address. */
4438 	if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
4439 	    (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) {
4440 		ip_pkt_t *ipp = &connp->conn_xmit_ipp;
4441 
4442 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
4443 			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4444 				v6src = ipp->ipp_addr;
4445 		} else {
4446 			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4447 				v6src = ipp->ipp_addr;
4448 		}
4449 	}
4450 
4451 	/* Defer IPsec if it might need to look at ICMP type/code */
4452 	switch (ixa->ixa_protocol) {
4453 	case IPPROTO_ICMP:
4454 	case IPPROTO_ICMPV6:
4455 		do_ipsec = B_FALSE;
4456 		break;
4457 	default:
4458 		do_ipsec = B_TRUE;
4459 	}
4460 
4461 	ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
4462 	mutex_exit(&connp->conn_lock);
4463 
4464 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
4465 	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
4466 	    (do_ipsec ? IPDF_IPSEC : 0));
4467 	switch (error) {
4468 	case 0:
4469 		break;
4470 	case EADDRNOTAVAIL:
4471 		/*
4472 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
4473 		 * Don't have the application see that errno
4474 		 */
4475 		error = ENETUNREACH;
4476 		goto failed;
4477 	case ENETDOWN:
4478 		/*
4479 		 * Have !ipif_addr_ready address; drop packet silently
4480 		 * until we can get applications to not send until we
4481 		 * are ready.
4482 		 */
4483 		error = 0;
4484 		goto failed;
4485 	case EHOSTUNREACH:
4486 	case ENETUNREACH:
4487 		if (ixa->ixa_ire != NULL) {
4488 			/*
4489 			 * Let conn_ip_output/ire_send_noroute return
4490 			 * the error and send any local ICMP error.
4491 			 */
4492 			error = 0;
4493 			break;
4494 		}
4495 		/* FALLTHRU */
4496 	default:
4497 	failed:
4498 		goto ud_error;
4499 	}
4500 
4501 	mutex_enter(&connp->conn_lock);
4502 	/*
4503 	 * While we dropped the lock some other thread might have connected
4504 	 * this socket. If so we bail out with EISCONN to ensure that the
4505 	 * connecting thread is the one that updates conn_ixa, conn_ht_*
4506 	 * and conn_*last*.
4507 	 */
4508 	if (icmp->icmp_state == TS_DATA_XFER) {
4509 		mutex_exit(&connp->conn_lock);
4510 		error = EISCONN;
4511 		goto ud_error;
4512 	}
4513 
4514 	/*
4515 	 * We need to rebuild the headers if
4516 	 *  - we are labeling packets (could be different for different
4517 	 *    destinations)
4518 	 *  - we have a source route (or routing header) since we need to
4519 	 *    massage that to get the pseudo-header checksum
4520 	 *  - a socket option with COA_HEADER_CHANGED has been set which
4521 	 *    set conn_v6lastdst to zero.
4522 	 *
4523 	 * Otherwise the prepend function will just update the src, dst,
4524 	 * and flow label.
4525 	 */
4526 	if (is_system_labeled()) {
4527 		/* TX MLP requires SCM_UCRED and don't have that here */
4528 		if (connp->conn_mlp_type != mlptSingle) {
4529 			mutex_exit(&connp->conn_lock);
4530 			error = ECONNREFUSED;
4531 			goto ud_error;
4532 		}
4533 		/*
4534 		 * Check whether Trusted Solaris policy allows communication
4535 		 * with this host, and pretend that the destination is
4536 		 * unreachable if not.
4537 		 * Compute any needed label and place it in ipp_label_v4/v6.
4538 		 *
4539 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
4540 		 * ipp_label_v4/v6 to form the packet.
4541 		 *
4542 		 * Tsol note: Since we hold conn_lock we know no other
4543 		 * thread manipulates conn_xmit_ipp.
4544 		 */
4545 		error = conn_update_label(connp, ixa, &v6dst,
4546 		    &connp->conn_xmit_ipp);
4547 		if (error != 0) {
4548 			mutex_exit(&connp->conn_lock);
4549 			goto ud_error;
4550 		}
4551 		/* Rebuild the header template */
4552 		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4553 		    flowinfo);
4554 		if (error != 0) {
4555 			mutex_exit(&connp->conn_lock);
4556 			goto ud_error;
4557 		}
4558 	} else if (connp->conn_xmit_ipp.ipp_fields &
4559 	    (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
4560 	    IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
4561 		/* Rebuild the header template */
4562 		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4563 		    flowinfo);
4564 		if (error != 0) {
4565 			mutex_exit(&connp->conn_lock);
4566 			goto ud_error;
4567 		}
4568 	} else {
4569 		/* Simply update the destination address if no source route */
4570 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
4571 			ipha_t	*ipha = (ipha_t *)connp->conn_ht_iphc;
4572 
4573 			IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
4574 			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
4575 				ipha->ipha_fragment_offset_and_flags |=
4576 				    IPH_DF_HTONS;
4577 			} else {
4578 				ipha->ipha_fragment_offset_and_flags &=
4579 				    ~IPH_DF_HTONS;
4580 			}
4581 		} else {
4582 			ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
4583 			ip6h->ip6_dst = v6dst;
4584 		}
4585 	}
4586 
4587 	/*
4588 	 * Remember the dst etc which corresponds to the built header
4589 	 * template and conn_ixa.
4590 	 */
4591 	oldixa = conn_replace_ixa(connp, ixa);
4592 	connp->conn_v6lastdst = v6dst;
4593 	connp->conn_lastflowinfo = flowinfo;
4594 	connp->conn_lastscopeid = ixa->ixa_scopeid;
4595 	connp->conn_lastsrcid = srcid;
4596 	/* Also remember a source to use together with lastdst */
4597 	connp->conn_v6lastsrc = v6src;
4598 
4599 	data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
4600 	    flowinfo, &error);
4601 
4602 	/* Done with conn_t */
4603 	mutex_exit(&connp->conn_lock);
4604 	ixa_refrele(oldixa);
4605 
4606 	if (data_mp == NULL) {
4607 		ASSERT(error != 0);
4608 		goto ud_error;
4609 	}
4610 
4611 	if (!do_ipsec) {
4612 		/* Policy might differ for different ICMP type/code */
4613 		data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
4614 		if (data_mp == NULL) {
4615 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4616 			error = EHOSTUNREACH;	/* IPsec policy failure */
4617 			goto done;
4618 		}
4619 	}
4620 
4621 	/* We're done.  Pass the packet to ip. */
4622 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4623 
4624 	error = conn_ip_output(data_mp, ixa);
4625 	/* No rawipOutErrors if an error since IP increases its error counter */
4626 	switch (error) {
4627 	case 0:
4628 		break;
4629 	case EWOULDBLOCK:
4630 		(void) ixa_check_drain_insert(connp, ixa);
4631 		error = 0;
4632 		break;
4633 	case EADDRNOTAVAIL:
4634 		/*
4635 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
4636 		 * Don't have the application see that errno
4637 		 */
4638 		error = ENETUNREACH;
4639 		/* FALLTHRU */
4640 	default:
4641 		mutex_enter(&connp->conn_lock);
4642 		/*
4643 		 * Clear the source and v6lastdst so we call ip_attr_connect
4644 		 * for the next packet and try to pick a better source.
4645 		 */
4646 		if (connp->conn_mcbc_bind)
4647 			connp->conn_saddr_v6 = ipv6_all_zeros;
4648 		else
4649 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4650 		connp->conn_v6lastdst = ipv6_all_zeros;
4651 		mutex_exit(&connp->conn_lock);
4652 		break;
4653 	}
4654 done:
4655 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4656 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
4657 	ixa->ixa_cpid = connp->conn_cpid;
4658 	ixa_refrele(ixa);
4659 	return (error);
4660 
4661 ud_error:
4662 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4663 	ixa->ixa_cred = connp->conn_cred;	/* Restore */
4664 	ixa->ixa_cpid = connp->conn_cpid;
4665 	ixa_refrele(ixa);
4666 
4667 	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4668 	freemsg(data_mp);
4669 	return (error);
4670 }
4671 
4672 /* ARGSUSED */
4673 static void
4674 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4675 {
4676 #ifdef DEBUG
4677 	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4678 #endif
4679 	freemsg(mp);
4680 }
4681 
4682 static void
4683 icmp_wput_other(queue_t *q, mblk_t *mp)
4684 {
4685 	uchar_t	*rptr = mp->b_rptr;
4686 	struct iocblk *iocp;
4687 	conn_t	*connp = Q_TO_CONN(q);
4688 	icmp_t	*icmp = connp->conn_icmp;
4689 	cred_t *cr;
4690 
4691 	switch (mp->b_datap->db_type) {
4692 	case M_PROTO:
4693 	case M_PCPROTO:
4694 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4695 			/*
4696 			 * If the message does not contain a PRIM_type,
4697 			 * throw it away.
4698 			 */
4699 			freemsg(mp);
4700 			return;
4701 		}
4702 		switch (((t_primp_t)rptr)->type) {
4703 		case T_ADDR_REQ:
4704 			icmp_addr_req(q, mp);
4705 			return;
4706 		case O_T_BIND_REQ:
4707 		case T_BIND_REQ:
4708 			icmp_tpi_bind(q, mp);
4709 			return;
4710 		case T_CONN_REQ:
4711 			icmp_tpi_connect(q, mp);
4712 			return;
4713 		case T_CAPABILITY_REQ:
4714 			icmp_capability_req(q, mp);
4715 			return;
4716 		case T_INFO_REQ:
4717 			icmp_info_req(q, mp);
4718 			return;
4719 		case T_UNITDATA_REQ:
4720 			/*
4721 			 * If a T_UNITDATA_REQ gets here, the address must
4722 			 * be bad.  Valid T_UNITDATA_REQs are handled
4723 			 * in icmp_wput.
4724 			 */
4725 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4726 			return;
4727 		case T_UNBIND_REQ:
4728 			icmp_tpi_unbind(q, mp);
4729 			return;
4730 		case T_SVR4_OPTMGMT_REQ:
4731 			/*
4732 			 * All Solaris components should pass a db_credp
4733 			 * for this TPI message, hence we ASSERT.
4734 			 * But in case there is some other M_PROTO that looks
4735 			 * like a TPI message sent by some other kernel
4736 			 * component, we check and return an error.
4737 			 */
4738 			cr = msg_getcred(mp, NULL);
4739 			ASSERT(cr != NULL);
4740 			if (cr == NULL) {
4741 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4742 				return;
4743 			}
4744 
4745 			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4746 			    cr)) {
4747 				svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4748 			}
4749 			return;
4750 
4751 		case T_OPTMGMT_REQ:
4752 			/*
4753 			 * All Solaris components should pass a db_credp
4754 			 * for this TPI message, hence we ASSERT.
4755 			 * But in case there is some other M_PROTO that looks
4756 			 * like a TPI message sent by some other kernel
4757 			 * component, we check and return an error.
4758 			 */
4759 			cr = msg_getcred(mp, NULL);
4760 			ASSERT(cr != NULL);
4761 			if (cr == NULL) {
4762 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4763 				return;
4764 			}
4765 			tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4766 			return;
4767 
4768 		case T_DISCON_REQ:
4769 			icmp_tpi_disconnect(q, mp);
4770 			return;
4771 
4772 		/* The following TPI message is not supported by icmp. */
4773 		case O_T_CONN_RES:
4774 		case T_CONN_RES:
4775 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4776 			return;
4777 
4778 		/* The following 3 TPI requests are illegal for icmp. */
4779 		case T_DATA_REQ:
4780 		case T_EXDATA_REQ:
4781 		case T_ORDREL_REQ:
4782 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4783 			return;
4784 		default:
4785 			break;
4786 		}
4787 		break;
4788 	case M_FLUSH:
4789 		if (*rptr & FLUSHW)
4790 			flushq(q, FLUSHDATA);
4791 		break;
4792 	case M_IOCTL:
4793 		iocp = (struct iocblk *)mp->b_rptr;
4794 		switch (iocp->ioc_cmd) {
4795 		case TI_GETPEERNAME:
4796 			if (icmp->icmp_state != TS_DATA_XFER) {
4797 				/*
4798 				 * If a default destination address has not
4799 				 * been associated with the stream, then we
4800 				 * don't know the peer's name.
4801 				 */
4802 				iocp->ioc_error = ENOTCONN;
4803 				iocp->ioc_count = 0;
4804 				mp->b_datap->db_type = M_IOCACK;
4805 				qreply(q, mp);
4806 				return;
4807 			}
4808 			/* FALLTHRU */
4809 		case TI_GETMYNAME:
4810 			/*
4811 			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
4812 			 * need to copyin the user's strbuf structure.
4813 			 * Processing will continue in the M_IOCDATA case
4814 			 * below.
4815 			 */
4816 			mi_copyin(q, mp, NULL,
4817 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4818 			return;
4819 		default:
4820 			break;
4821 		}
4822 		break;
4823 	case M_IOCDATA:
4824 		icmp_wput_iocdata(q, mp);
4825 		return;
4826 	default:
4827 		/* Unrecognized messages are passed through without change. */
4828 		break;
4829 	}
4830 	ip_wput_nondata(q, mp);
4831 }
4832 
4833 /*
4834  * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4835  * messages.
4836  */
4837 static void
4838 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4839 {
4840 	mblk_t		*mp1;
4841 	STRUCT_HANDLE(strbuf, sb);
4842 	uint_t		addrlen;
4843 	conn_t		*connp = Q_TO_CONN(q);
4844 	icmp_t		*icmp = connp->conn_icmp;
4845 
4846 	/* Make sure it is one of ours. */
4847 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4848 	case TI_GETMYNAME:
4849 	case TI_GETPEERNAME:
4850 		break;
4851 	default:
4852 		ip_wput_nondata(q, mp);
4853 		return;
4854 	}
4855 
4856 	switch (mi_copy_state(q, mp, &mp1)) {
4857 	case -1:
4858 		return;
4859 	case MI_COPY_CASE(MI_COPY_IN, 1):
4860 		break;
4861 	case MI_COPY_CASE(MI_COPY_OUT, 1):
4862 		/*
4863 		 * The address has been copied out, so now
4864 		 * copyout the strbuf.
4865 		 */
4866 		mi_copyout(q, mp);
4867 		return;
4868 	case MI_COPY_CASE(MI_COPY_OUT, 2):
4869 		/*
4870 		 * The address and strbuf have been copied out.
4871 		 * We're done, so just acknowledge the original
4872 		 * M_IOCTL.
4873 		 */
4874 		mi_copy_done(q, mp, 0);
4875 		return;
4876 	default:
4877 		/*
4878 		 * Something strange has happened, so acknowledge
4879 		 * the original M_IOCTL with an EPROTO error.
4880 		 */
4881 		mi_copy_done(q, mp, EPROTO);
4882 		return;
4883 	}
4884 
4885 	/*
4886 	 * Now we have the strbuf structure for TI_GETMYNAME
4887 	 * and TI_GETPEERNAME.  Next we copyout the requested
4888 	 * address and then we'll copyout the strbuf.
4889 	 */
4890 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
4891 	    (void *)mp1->b_rptr);
4892 
4893 	if (connp->conn_family == AF_INET)
4894 		addrlen = sizeof (sin_t);
4895 	else
4896 		addrlen = sizeof (sin6_t);
4897 
4898 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
4899 		mi_copy_done(q, mp, EINVAL);
4900 		return;
4901 	}
4902 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4903 	case TI_GETMYNAME:
4904 		break;
4905 	case TI_GETPEERNAME:
4906 		if (icmp->icmp_state != TS_DATA_XFER) {
4907 			mi_copy_done(q, mp, ENOTCONN);
4908 			return;
4909 		}
4910 		break;
4911 	default:
4912 		mi_copy_done(q, mp, EPROTO);
4913 		return;
4914 	}
4915 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
4916 	if (!mp1)
4917 		return;
4918 
4919 	STRUCT_FSET(sb, len, addrlen);
4920 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4921 	case TI_GETMYNAME:
4922 		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
4923 		    &addrlen);
4924 		break;
4925 	case TI_GETPEERNAME:
4926 		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
4927 		    &addrlen);
4928 		break;
4929 	}
4930 	mp1->b_wptr += addrlen;
4931 	/* Copy out the address */
4932 	mi_copyout(q, mp);
4933 }
4934 
4935 void
4936 icmp_ddi_g_init(void)
4937 {
4938 	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
4939 	    icmp_opt_obj.odb_opt_arr_cnt);
4940 
4941 	/*
4942 	 * We want to be informed each time a stack is created or
4943 	 * destroyed in the kernel, so we can maintain the
4944 	 * set of icmp_stack_t's.
4945 	 */
4946 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
4947 }
4948 
4949 void
4950 icmp_ddi_g_destroy(void)
4951 {
4952 	netstack_unregister(NS_ICMP);
4953 }
4954 
4955 #define	INET_NAME	"ip"
4956 
4957 /*
4958  * Initialize the ICMP stack instance.
4959  */
4960 static void *
4961 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
4962 {
4963 	icmp_stack_t	*is;
4964 	int		error = 0;
4965 	size_t		arrsz;
4966 	major_t		major;
4967 
4968 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
4969 	is->is_netstack = ns;
4970 
4971 	arrsz = sizeof (icmp_propinfo_tbl);
4972 	is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
4973 	bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz);
4974 
4975 	is->is_ksp = rawip_kstat_init(stackid);
4976 
4977 	major = mod_name_to_major(INET_NAME);
4978 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
4979 	ASSERT(error == 0);
4980 	return (is);
4981 }
4982 
4983 /*
4984  * Free the ICMP stack instance.
4985  */
4986 static void
4987 rawip_stack_fini(netstackid_t stackid, void *arg)
4988 {
4989 	icmp_stack_t *is = (icmp_stack_t *)arg;
4990 
4991 	kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl));
4992 	is->is_propinfo_tbl = NULL;
4993 
4994 	rawip_kstat_fini(stackid, is->is_ksp);
4995 	is->is_ksp = NULL;
4996 	ldi_ident_release(is->is_ldi_ident);
4997 	kmem_free(is, sizeof (*is));
4998 }
4999 
5000 static void *
5001 rawip_kstat_init(netstackid_t stackid) {
5002 	kstat_t	*ksp;
5003 
5004 	rawip_named_kstat_t template = {
5005 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5006 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5007 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5008 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5009 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5010 	};
5011 
5012 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5013 					KSTAT_TYPE_NAMED,
5014 					NUM_OF_FIELDS(rawip_named_kstat_t),
5015 					0, stackid);
5016 	if (ksp == NULL || ksp->ks_data == NULL)
5017 		return (NULL);
5018 
5019 	bcopy(&template, ksp->ks_data, sizeof (template));
5020 	ksp->ks_update = rawip_kstat_update;
5021 	ksp->ks_private = (void *)(uintptr_t)stackid;
5022 
5023 	kstat_install(ksp);
5024 	return (ksp);
5025 }
5026 
5027 static void
5028 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5029 {
5030 	if (ksp != NULL) {
5031 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5032 		kstat_delete_netstack(ksp, stackid);
5033 	}
5034 }
5035 
5036 static int
5037 rawip_kstat_update(kstat_t *ksp, int rw)
5038 {
5039 	rawip_named_kstat_t *rawipkp;
5040 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5041 	netstack_t	*ns;
5042 	icmp_stack_t	*is;
5043 
5044 	if ((ksp == NULL) || (ksp->ks_data == NULL))
5045 		return (EIO);
5046 
5047 	if (rw == KSTAT_WRITE)
5048 		return (EACCES);
5049 
5050 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5051 
5052 	ns = netstack_find_by_stackid(stackid);
5053 	if (ns == NULL)
5054 		return (-1);
5055 	is = ns->netstack_icmp;
5056 	if (is == NULL) {
5057 		netstack_rele(ns);
5058 		return (-1);
5059 	}
5060 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5061 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5062 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5063 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5064 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5065 	netstack_rele(ns);
5066 	return (0);
5067 }
5068 
5069 /* ARGSUSED */
5070 int
5071 rawip_accept(sock_lower_handle_t lproto_handle,
5072     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5073     cred_t *cr)
5074 {
5075 	return (EOPNOTSUPP);
5076 }
5077 
5078 /* ARGSUSED */
5079 int
5080 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5081     socklen_t len, cred_t *cr)
5082 {
5083 	conn_t  *connp = (conn_t *)proto_handle;
5084 	int	error;
5085 
5086 	/* All Solaris components should pass a cred for this operation. */
5087 	ASSERT(cr != NULL);
5088 
5089 	/* Binding to a NULL address really means unbind */
5090 	if (sa == NULL)
5091 		error = rawip_do_unbind(connp);
5092 	else
5093 		error = rawip_do_bind(connp, sa, len);
5094 
5095 	if (error < 0) {
5096 		if (error == -TOUTSTATE)
5097 			error = EINVAL;
5098 		else
5099 			error = proto_tlitosyserr(-error);
5100 	}
5101 	return (error);
5102 }
5103 
5104 static int
5105 rawip_implicit_bind(conn_t *connp)
5106 {
5107 	sin6_t sin6addr;
5108 	sin_t *sin;
5109 	sin6_t *sin6;
5110 	socklen_t len;
5111 	int error;
5112 
5113 	if (connp->conn_family == AF_INET) {
5114 		len = sizeof (struct sockaddr_in);
5115 		sin = (sin_t *)&sin6addr;
5116 		*sin = sin_null;
5117 		sin->sin_family = AF_INET;
5118 		sin->sin_addr.s_addr = INADDR_ANY;
5119 	} else {
5120 		ASSERT(connp->conn_family == AF_INET6);
5121 		len = sizeof (sin6_t);
5122 		sin6 = (sin6_t *)&sin6addr;
5123 		*sin6 = sin6_null;
5124 		sin6->sin6_family = AF_INET6;
5125 		V6_SET_ZERO(sin6->sin6_addr);
5126 	}
5127 
5128 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5129 
5130 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5131 }
5132 
5133 static int
5134 rawip_unbind(conn_t *connp)
5135 {
5136 	int error;
5137 
5138 	error = rawip_do_unbind(connp);
5139 	if (error < 0) {
5140 		error = proto_tlitosyserr(-error);
5141 	}
5142 	return (error);
5143 }
5144 
5145 /* ARGSUSED */
5146 int
5147 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5148 {
5149 	return (EOPNOTSUPP);
5150 }
5151 
5152 int
5153 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5154     socklen_t len, sock_connid_t *id, cred_t *cr)
5155 {
5156 	conn_t	*connp = (conn_t *)proto_handle;
5157 	icmp_t *icmp = connp->conn_icmp;
5158 	int	error;
5159 	boolean_t did_bind = B_FALSE;
5160 	pid_t	pid = curproc->p_pid;
5161 
5162 	/* All Solaris components should pass a cred for this operation. */
5163 	ASSERT(cr != NULL);
5164 
5165 	if (sa == NULL) {
5166 		/*
5167 		 * Disconnect
5168 		 * Make sure we are connected
5169 		 */
5170 		if (icmp->icmp_state != TS_DATA_XFER)
5171 			return (EINVAL);
5172 
5173 		error = icmp_disconnect(connp);
5174 		return (error);
5175 	}
5176 
5177 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
5178 	if (error != 0)
5179 		return (error);
5180 
5181 	/* do an implicit bind if necessary */
5182 	if (icmp->icmp_state == TS_UNBND) {
5183 		error = rawip_implicit_bind(connp);
5184 		/*
5185 		 * We could be racing with an actual bind, in which case
5186 		 * we would see EPROTO. We cross our fingers and try
5187 		 * to connect.
5188 		 */
5189 		if (!(error == 0 || error == EPROTO))
5190 			return (error);
5191 		did_bind = B_TRUE;
5192 	}
5193 
5194 	/*
5195 	 * set SO_DGRAM_ERRIND
5196 	 */
5197 	connp->conn_dgram_errind = B_TRUE;
5198 
5199 	error = rawip_do_connect(connp, sa, len, cr, pid);
5200 	if (error != 0 && did_bind) {
5201 		int unbind_err;
5202 
5203 		unbind_err = rawip_unbind(connp);
5204 		ASSERT(unbind_err == 0);
5205 	}
5206 
5207 	if (error == 0) {
5208 		*id = 0;
5209 		(*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5210 		    0, NULL, -1);
5211 	} else if (error < 0) {
5212 		error = proto_tlitosyserr(-error);
5213 	}
5214 	return (error);
5215 }
5216 
/*
 * Convert a non-STREAMS raw IP socket into a STREAMS one using the
 * queue pair `q'.
 *
 * The conn is attached to `q', the stream head is told the write offset
 * and receive high-water mark, the helper stream is released, and the
 * current capability info, local/peer addresses and socket options are
 * handed to `quiesced_cb'.  Finally any messages that were parked on the
 * icmp_t fallback queue are sent upstream and IPCL_NONSTR is cleared.
 *
 * `direct_sockfs' is unused.  Always returns 0.
 */
/* ARGSUSED2 */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
{
	conn_t  *connp = (conn_t *)proto_handle;
	icmp_t	*icmp;
	struct T_capability_ack tca;
	struct sockaddr_in6 laddr, faddr;
	socklen_t laddrlen, faddrlen;
	short opts;
	struct stroptions *stropt;
	mblk_t *stropt_mp;
	int error;

	icmp = connp->conn_icmp;

	/*
	 * NOTE(review): the result is used below without a NULL check;
	 * presumably allocb_wait() with STR_NOSIG blocks until it
	 * succeeds -- confirm.
	 */
	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);

	/*
	 * setup the fallback stream that was allocated
	 */
	/*
	 * The q_ptr fields are read here, before they are overwritten
	 * with connp just below.
	 */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	WR(q)->q_qinfo = &icmpwinit;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	/* Notify stream head about options before sending up data */
	stropt_mp->b_datap->db_type = M_SETOPTS;
	stropt_mp->b_wptr += sizeof (*stropt);
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt->so_flags = SO_WROFF | SO_HIWAT;
	stropt->so_wroff = connp->conn_wroff;
	stropt->so_hiwat = connp->conn_rcvbuf;
	putnext(RD(q), stropt_mp);

	/*
	 * free helper stream
	 */
	ip_free_helper_stream(connp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	icmp_do_capability_ack(icmp, &tca, TC1_INFO);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) rawip_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = rawip_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	/* No peer address available: report a zero-length peer address. */
	if (error != 0)
		faddrlen = 0;
	opts = 0;
	if (connp->conn_dgram_errind)
		opts |= SO_DGRAM_ERRIND;
	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
		opts |= SO_DONTROUTE;

	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/*
	 * Attempts to send data up during fallback will result in it being
	 * queued in icmp_t. Now we push up any queued packets.
	 */
	mutex_enter(&icmp->icmp_recv_lock);
	while (icmp->icmp_fallback_queue_head != NULL) {
		mblk_t	*mp;

		/* Unlink the head message while holding icmp_recv_lock. */
		mp = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp->b_next;
		mp->b_next = NULL;
		/* icmp_recv_lock is dropped across the putnext() call. */
		mutex_exit(&icmp->icmp_recv_lock);
		putnext(RD(q), mp);
		mutex_enter(&icmp->icmp_recv_lock);
	}
	/* The head is NULL once the loop exits, so this clears the tail. */
	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;

	/*
	 * No longer a streams less socket
	 */
	/* IPCL_NONSTR is cleared while icmp_recv_lock is still held. */
	mutex_enter(&connp->conn_lock);
	connp->conn_flags &= ~IPCL_NONSTR;
	mutex_exit(&connp->conn_lock);

	mutex_exit(&icmp->icmp_recv_lock);

	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
	    icmp->icmp_fallback_queue_tail == NULL);

	ASSERT(connp->conn_ref >= 1);

	return (0);
}
5318 
5319 /* ARGSUSED2 */
5320 sock_lower_handle_t
5321 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5322     uint_t *smodep, int *errorp, int flags, cred_t *credp)
5323 {
5324 	conn_t *connp;
5325 
5326 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5327 		*errorp = EPROTONOSUPPORT;
5328 		return (NULL);
5329 	}
5330 
5331 	connp = rawip_do_open(family, credp, errorp, flags);
5332 	if (connp != NULL) {
5333 		connp->conn_flags |= IPCL_NONSTR;
5334 
5335 		mutex_enter(&connp->conn_lock);
5336 		connp->conn_state_flags &= ~CONN_INCIPIENT;
5337 		mutex_exit(&connp->conn_lock);
5338 		*sock_downcalls = &sock_rawip_downcalls;
5339 		*smodep = SM_ATOMIC;
5340 	} else {
5341 		ASSERT(*errorp != 0);
5342 	}
5343 
5344 	return ((sock_lower_handle_t)connp);
5345 }
5346 
5347 /* ARGSUSED3 */
5348 void
5349 rawip_activate(sock_lower_handle_t proto_handle,
5350     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5351     cred_t *cr)
5352 {
5353 	conn_t 			*connp = (conn_t *)proto_handle;
5354 	struct sock_proto_props sopp;
5355 
5356 	/* All Solaris components should pass a cred for this operation. */
5357 	ASSERT(cr != NULL);
5358 
5359 	connp->conn_upcalls = sock_upcalls;
5360 	connp->conn_upper_handle = sock_handle;
5361 
5362 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5363 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5364 	sopp.sopp_wroff = connp->conn_wroff;
5365 	sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5366 	sopp.sopp_rxlowat = connp->conn_rcvlowat;
5367 	sopp.sopp_maxblk = INFPSZ;
5368 	sopp.sopp_maxpsz = IP_MAXPACKET;
5369 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5370 	    icmp_mod_info.mi_minpsz;
5371 
5372 	(*connp->conn_upcalls->su_set_proto_props)
5373 	    (connp->conn_upper_handle, &sopp);
5374 
5375 	icmp_bind_proto(connp->conn_icmp);
5376 }
5377 
5378 /* ARGSUSED3 */
5379 int
5380 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5381     socklen_t *salenp, cred_t *cr)
5382 {
5383 	conn_t  *connp = (conn_t *)proto_handle;
5384 	icmp_t  *icmp = connp->conn_icmp;
5385 	int	error;
5386 
5387 	/* All Solaris components should pass a cred for this operation. */
5388 	ASSERT(cr != NULL);
5389 
5390 	mutex_enter(&connp->conn_lock);
5391 	if (icmp->icmp_state != TS_DATA_XFER)
5392 		error = ENOTCONN;
5393 	else
5394 		error = conn_getpeername(connp, sa, salenp);
5395 	mutex_exit(&connp->conn_lock);
5396 	return (error);
5397 }
5398 
5399 /* ARGSUSED3 */
5400 int
5401 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5402     socklen_t *salenp, cred_t *cr)
5403 {
5404 	conn_t  *connp = (conn_t *)proto_handle;
5405 	int	error;
5406 
5407 	/* All Solaris components should pass a cred for this operation. */
5408 	ASSERT(cr != NULL);
5409 
5410 	mutex_enter(&connp->conn_lock);
5411 	error = conn_getsockname(connp, sa, salenp);
5412 	mutex_exit(&connp->conn_lock);
5413 	return (error);
5414 }
5415 
5416 int
5417 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5418     const void *optvalp, socklen_t optlen, cred_t *cr)
5419 {
5420 	conn_t	*connp = (conn_t *)proto_handle;
5421 	int error;
5422 
5423 	/* All Solaris components should pass a cred for this operation. */
5424 	ASSERT(cr != NULL);
5425 
5426 	error = proto_opt_check(level, option_name, optlen, NULL,
5427 	    icmp_opt_obj.odb_opt_des_arr,
5428 	    icmp_opt_obj.odb_opt_arr_cnt,
5429 	    B_TRUE, B_FALSE, cr);
5430 
5431 	if (error != 0) {
5432 		/*
5433 		 * option not recognized
5434 		 */
5435 		if (error < 0) {
5436 			error = proto_tlitosyserr(-error);
5437 		}
5438 		return (error);
5439 	}
5440 
5441 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5442 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5443 	    (uchar_t *)optvalp, NULL, cr);
5444 
5445 	ASSERT(error >= 0);
5446 
5447 	return (error);
5448 }
5449 
5450 int
5451 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5452     void *optvalp, socklen_t *optlen, cred_t *cr)
5453 {
5454 	int		error;
5455 	conn_t		*connp = (conn_t *)proto_handle;
5456 	t_uscalar_t	max_optbuf_len;
5457 	void		*optvalp_buf;
5458 	int		len;
5459 
5460 	/* All Solaris components should pass a cred for this operation. */
5461 	ASSERT(cr != NULL);
5462 
5463 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5464 	    icmp_opt_obj.odb_opt_des_arr,
5465 	    icmp_opt_obj.odb_opt_arr_cnt,
5466 	    B_FALSE, B_TRUE, cr);
5467 
5468 	if (error != 0) {
5469 		if (error < 0) {
5470 			error = proto_tlitosyserr(-error);
5471 		}
5472 		return (error);
5473 	}
5474 
5475 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5476 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5477 	if (len == -1) {
5478 		kmem_free(optvalp_buf, max_optbuf_len);
5479 		return (EINVAL);
5480 	}
5481 
5482 	/*
5483 	 * update optlen and copy option value
5484 	 */
5485 	t_uscalar_t size = MIN(len, *optlen);
5486 
5487 	bcopy(optvalp_buf, optvalp, size);
5488 	bcopy(&size, optlen, sizeof (size));
5489 
5490 	kmem_free(optvalp_buf, max_optbuf_len);
5491 	return (0);
5492 }
5493 
5494 /* ARGSUSED1 */
5495 int
5496 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5497 {
5498 	conn_t	*connp = (conn_t *)proto_handle;
5499 
5500 	/* All Solaris components should pass a cred for this operation. */
5501 	ASSERT(cr != NULL);
5502 
5503 	(void) rawip_do_close(connp);
5504 	return (0);
5505 }
5506 
5507 /* ARGSUSED2 */
5508 int
5509 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5510 {
5511 	conn_t  *connp = (conn_t *)proto_handle;
5512 
5513 	/* All Solaris components should pass a cred for this operation. */
5514 	ASSERT(cr != NULL);
5515 
5516 	/* shut down the send side */
5517 	if (how != SHUT_RD)
5518 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5519 		    SOCK_OPCTL_SHUT_SEND, 0);
5520 	/* shut down the recv side */
5521 	if (how != SHUT_WR)
5522 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5523 		    SOCK_OPCTL_SHUT_RECV, 0);
5524 	return (0);
5525 }
5526 
5527 void
5528 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5529 {
5530 	conn_t  *connp = (conn_t *)proto_handle;
5531 	icmp_t	*icmp = connp->conn_icmp;
5532 
5533 	mutex_enter(&icmp->icmp_recv_lock);
5534 	connp->conn_flow_cntrld = B_FALSE;
5535 	mutex_exit(&icmp->icmp_recv_lock);
5536 }
5537 
5538 int
5539 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5540     int mode, int32_t *rvalp, cred_t *cr)
5541 {
5542 	conn_t  	*connp = (conn_t *)proto_handle;
5543 	int		error;
5544 
5545 	/* All Solaris components should pass a cred for this operation. */
5546 	ASSERT(cr != NULL);
5547 
5548 	/*
5549 	 * If we don't have a helper stream then create one.
5550 	 * ip_create_helper_stream takes care of locking the conn_t,
5551 	 * so this check for NULL is just a performance optimization.
5552 	 */
5553 	if (connp->conn_helper_info == NULL) {
5554 		icmp_stack_t *is = connp->conn_icmp->icmp_is;
5555 
5556 		ASSERT(is->is_ldi_ident != NULL);
5557 
5558 		/*
5559 		 * Create a helper stream for non-STREAMS socket.
5560 		 */
5561 		error = ip_create_helper_stream(connp, is->is_ldi_ident);
5562 		if (error != 0) {
5563 			ip0dbg(("rawip_ioctl: create of IP helper stream "
5564 			    "failed %d\n", error));
5565 			return (error);
5566 		}
5567 	}
5568 
5569 	switch (cmd) {
5570 	case _SIOCSOCKFALLBACK:
5571 	case TI_GETPEERNAME:
5572 	case TI_GETMYNAME:
5573 #ifdef DEBUG
5574 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5575 		    " socket", cmd);
5576 #endif
5577 		error = EINVAL;
5578 		break;
5579 	default:
5580 		/*
5581 		 * Pass on to IP using helper stream
5582 		 */
5583 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5584 		    cmd, arg, mode, cr, rvalp);
5585 		break;
5586 	}
5587 	return (error);
5588 }
5589 
5590 int
5591 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5592     cred_t *cr)
5593 {
5594 	sin6_t		*sin6;
5595 	sin_t		*sin = NULL;
5596 	uint_t		srcid;
5597 	conn_t		*connp = (conn_t *)proto_handle;
5598 	icmp_t		*icmp = connp->conn_icmp;
5599 	int		error = 0;
5600 	icmp_stack_t	*is = icmp->icmp_is;
5601 	pid_t		pid = curproc->p_pid;
5602 	ip_xmit_attr_t	*ixa;
5603 
5604 	ASSERT(DB_TYPE(mp) == M_DATA);
5605 
5606 	/* All Solaris components should pass a cred for this operation. */
5607 	ASSERT(cr != NULL);
5608 
5609 	/* do an implicit bind if necessary */
5610 	if (icmp->icmp_state == TS_UNBND) {
5611 		error = rawip_implicit_bind(connp);
5612 		/*
5613 		 * We could be racing with an actual bind, in which case
5614 		 * we would see EPROTO. We cross our fingers and try
5615 		 * to connect.
5616 		 */
5617 		if (!(error == 0 || error == EPROTO)) {
5618 			freemsg(mp);
5619 			return (error);
5620 		}
5621 	}
5622 
5623 	/* Protocol 255 contains full IP headers */
5624 	/* Read without holding lock */
5625 	if (icmp->icmp_hdrincl) {
5626 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
5627 		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5628 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5629 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5630 				freemsg(mp);
5631 				return (EINVAL);
5632 			}
5633 		}
5634 		error = icmp_output_hdrincl(connp, mp, cr, pid);
5635 		if (is->is_sendto_ignerr)
5636 			return (0);
5637 		else
5638 			return (error);
5639 	}
5640 
5641 	/* Connected? */
5642 	if (msg->msg_name == NULL) {
5643 		if (icmp->icmp_state != TS_DATA_XFER) {
5644 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5645 			return (EDESTADDRREQ);
5646 		}
5647 		if (msg->msg_controllen != 0) {
5648 			error = icmp_output_ancillary(connp, NULL, NULL, mp,
5649 			    NULL, msg, cr, pid);
5650 		} else {
5651 			error = icmp_output_connected(connp, mp, cr, pid);
5652 		}
5653 		if (is->is_sendto_ignerr)
5654 			return (0);
5655 		else
5656 			return (error);
5657 	}
5658 	if (icmp->icmp_state == TS_DATA_XFER) {
5659 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5660 		return (EISCONN);
5661 	}
5662 	error = proto_verify_ip_addr(connp->conn_family,
5663 	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5664 	if (error != 0) {
5665 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5666 		return (error);
5667 	}
5668 	switch (connp->conn_family) {
5669 	case AF_INET6:
5670 		sin6 = (sin6_t *)msg->msg_name;
5671 
5672 		/* No support for mapped addresses on raw sockets */
5673 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5674 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5675 			return (EADDRNOTAVAIL);
5676 		}
5677 		srcid = sin6->__sin6_src_id;
5678 
5679 		/*
5680 		 * If the local address is a mapped address return
5681 		 * an error.
5682 		 * It would be possible to send an IPv6 packet but the
5683 		 * response would never make it back to the application
5684 		 * since it is bound to a mapped address.
5685 		 */
5686 		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5687 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5688 			return (EADDRNOTAVAIL);
5689 		}
5690 
5691 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5692 			sin6->sin6_addr = ipv6_loopback;
5693 
5694 		/*
5695 		 * We have to allocate an ip_xmit_attr_t before we grab
5696 		 * conn_lock and we need to hold conn_lock once we've check
5697 		 * conn_same_as_last_v6 to handle concurrent send* calls on a
5698 		 * socket.
5699 		 */
5700 		if (msg->msg_controllen == 0) {
5701 			ixa = conn_get_ixa(connp, B_FALSE);
5702 			if (ixa == NULL) {
5703 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5704 				return (ENOMEM);
5705 			}
5706 		} else {
5707 			ixa = NULL;
5708 		}
5709 		mutex_enter(&connp->conn_lock);
5710 		if (icmp->icmp_delayed_error != 0) {
5711 			sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5712 
5713 			error = icmp->icmp_delayed_error;
5714 			icmp->icmp_delayed_error = 0;
5715 
5716 			/* Compare IP address and family */
5717 
5718 			if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5719 			    &sin2->sin6_addr) &&
5720 			    sin6->sin6_family == sin2->sin6_family) {
5721 				mutex_exit(&connp->conn_lock);
5722 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5723 				if (ixa != NULL)
5724 					ixa_refrele(ixa);
5725 				return (error);
5726 			}
5727 		}
5728 		if (msg->msg_controllen != 0) {
5729 			mutex_exit(&connp->conn_lock);
5730 			ASSERT(ixa == NULL);
5731 			error = icmp_output_ancillary(connp, NULL, sin6, mp,
5732 			    NULL, msg, cr, pid);
5733 		} else if (conn_same_as_last_v6(connp, sin6) &&
5734 		    connp->conn_lastsrcid == srcid &&
5735 		    ipsec_outbound_policy_current(ixa)) {
5736 			/* icmp_output_lastdst drops conn_lock */
5737 			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5738 		} else {
5739 			/* icmp_output_newdst drops conn_lock */
5740 			error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5741 			    pid, ixa);
5742 		}
5743 		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5744 		if (is->is_sendto_ignerr)
5745 			return (0);
5746 		else
5747 			return (error);
5748 	case AF_INET:
5749 		sin = (sin_t *)msg->msg_name;
5750 
5751 		if (sin->sin_addr.s_addr == INADDR_ANY)
5752 			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5753 
5754 		/*
5755 		 * We have to allocate an ip_xmit_attr_t before we grab
5756 		 * conn_lock and we need to hold conn_lock once we've check
5757 		 * conn_same_as_last_v6 to handle concurrent send* on a socket.
5758 		 */
5759 		if (msg->msg_controllen == 0) {
5760 			ixa = conn_get_ixa(connp, B_FALSE);
5761 			if (ixa == NULL) {
5762 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5763 				return (ENOMEM);
5764 			}
5765 		} else {
5766 			ixa = NULL;
5767 		}
5768 		mutex_enter(&connp->conn_lock);
5769 		if (icmp->icmp_delayed_error != 0) {
5770 			sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5771 
5772 			error = icmp->icmp_delayed_error;
5773 			icmp->icmp_delayed_error = 0;
5774 
5775 			/* Compare IP address */
5776 
5777 			if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5778 				mutex_exit(&connp->conn_lock);
5779 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5780 				if (ixa != NULL)
5781 					ixa_refrele(ixa);
5782 				return (error);
5783 			}
5784 		}
5785 
5786 		if (msg->msg_controllen != 0) {
5787 			mutex_exit(&connp->conn_lock);
5788 			ASSERT(ixa == NULL);
5789 			error = icmp_output_ancillary(connp, sin, NULL, mp,
5790 			    NULL, msg, cr, pid);
5791 		} else if (conn_same_as_last_v4(connp, sin) &&
5792 		    ipsec_outbound_policy_current(ixa)) {
5793 			/* icmp_output_lastdst drops conn_lock */
5794 			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5795 		} else {
5796 			/* icmp_output_newdst drops conn_lock */
5797 			error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5798 			    pid, ixa);
5799 		}
5800 		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5801 		if (is->is_sendto_ignerr)
5802 			return (0);
5803 		else
5804 			return (error);
5805 	default:
5806 		return (EINVAL);
5807 	}
5808 }
5809 
/*
 * Downcall vector for raw IP sockets; rawip_create() hands its address
 * to the caller through the sock_downcalls argument.  The initializer is
 * positional, so the order of the entries must match the member order of
 * sock_downcalls_t.  Three slots are left NULL, meaning no raw IP entry
 * point is supplied for them.
 */
sock_downcalls_t sock_rawip_downcalls = {
	rawip_activate,
	rawip_accept,
	rawip_bind,
	rawip_listen,
	rawip_connect,
	rawip_getpeername,
	rawip_getsockname,
	rawip_getsockopt,
	rawip_setsockopt,
	rawip_send,
	NULL,
	NULL,
	NULL,
	rawip_shutdown,
	rawip_clr_flowctrl,
	rawip_ioctl,
	rawip_close
};
5829