xref: /titanic_44/usr/src/uts/common/inet/ip/icmp.c (revision f0b62587229842fad8c5df20795bf9bca17327bd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/stropts.h>
30 #include <sys/strlog.h>
31 #include <sys/strsun.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/timod.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/strsubr.h>
38 #include <sys/suntpi.h>
39 #include <sys/xti_inet.h>
40 #include <sys/cmn_err.h>
41 #include <sys/kmem.h>
42 #include <sys/cred_impl.h>
43 #include <sys/policy.h>
44 #include <sys/priv.h>
45 #include <sys/ucred.h>
46 #include <sys/zone.h>
47 
48 #include <sys/sockio.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/vtrace.h>
52 #include <sys/sdt.h>
53 #include <sys/debug.h>
54 #include <sys/isa_defs.h>
55 #include <sys/random.h>
56 #include <netinet/in.h>
57 #include <netinet/ip6.h>
58 #include <netinet/icmp6.h>
59 #include <netinet/udp.h>
60 
61 #include <inet/common.h>
62 #include <inet/ip.h>
63 #include <inet/ip_impl.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ip6.h>
66 #include <inet/ip_ire.h>
67 #include <inet/ip_if.h>
68 #include <inet/ip_multi.h>
69 #include <inet/ip_ndp.h>
70 #include <inet/proto_set.h>
71 #include <inet/mib2.h>
72 #include <inet/nd.h>
73 #include <inet/optcom.h>
74 #include <inet/snmpcom.h>
75 #include <inet/kstatcom.h>
76 #include <inet/ipclassifier.h>
77 
78 #include <sys/tsol/label.h>
79 #include <sys/tsol/tnet.h>
80 
81 #include <inet/rawip_impl.h>
82 
83 #include <sys/disp.h>
84 
85 /*
86  * Synchronization notes:
87  *
88  * RAWIP is MT and uses the usual kernel synchronization primitives. We use
89  * conn_lock to protect the icmp_t.
90  *
91  * Plumbing notes:
92  * ICMP is always a device driver. For compatibility with mibopen() code
93  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
94  * dummy module.
95  */
96 
97 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
98 static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
99 static void	icmp_bind_proto(icmp_t *icmp);
100 static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
101     const in6_addr_t *, uint32_t);
102 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
103 static int	icmp_close(queue_t *q, int flags);
104 static void	icmp_close_free(conn_t *);
105 static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
106 static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
107 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
108     int sys_error);
109 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
110     t_scalar_t tlierr, int sys_error);
111 static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
112     ip_recv_attr_t *);
113 static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
114     ip_recv_attr_t *);
115 static void	icmp_info_req(queue_t *q, mblk_t *mp);
116 static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
117 static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
118 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
119 		    cred_t *credp);
120 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
121 		    cred_t *credp);
122 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
123 int		icmp_opt_set(conn_t *connp, uint_t optset_context,
124 		    int level, int name, uint_t inlen,
125 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
126 		    void *thisdg_attrs, cred_t *cr);
127 int		icmp_opt_get(conn_t *connp, int level, int name,
128 		    uchar_t *ptr);
129 static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
130 		    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
131 static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
132 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
133 static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
134 		    caddr_t cp, cred_t *cr);
135 static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
136     const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
137 static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
138     mblk_t *, const in6_addr_t *, uint32_t, int *);
139 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
140 		    uchar_t *ptr, int len);
141 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
142 static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
143 static void	icmp_wput(queue_t *q, mblk_t *mp);
144 static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
145 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
146 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
147 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
148 static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);
149 
150 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
151 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
152 
153 static void	*rawip_kstat_init(netstackid_t stackid);
154 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
155 static int	rawip_kstat_update(kstat_t *kp, int rw);
156 static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);
157 
158 /* Common routines for TPI and socket module */
159 static conn_t	*rawip_do_open(int, cred_t *, int *, int);
160 static void	rawip_do_close(conn_t *);
161 static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
162 static int	rawip_do_unbind(conn_t *);
163 static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
164     cred_t *, pid_t);
165 
166 int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
167 		    socklen_t *, cred_t *);
168 int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
169 		    socklen_t *, cred_t *);
170 
171 static struct module_info icmp_mod_info =  {
172 	5707, "icmp", 1, INFPSZ, 512, 128
173 };
174 
175 /*
176  * Entry points for ICMP as a device.
177  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
178  */
179 static struct qinit icmprinitv4 = {
180 	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
181 };
182 
183 static struct qinit icmprinitv6 = {
184 	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
185 };
186 
187 static struct qinit icmpwinit = {
188 	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
189 };
190 
191 /* ICMP entry point during fallback */
192 static struct qinit icmp_fallback_sock_winit = {
193 	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
194 };
195 
196 /* For AF_INET aka /dev/icmp */
197 struct streamtab icmpinfov4 = {
198 	&icmprinitv4, &icmpwinit
199 };
200 
201 /* For AF_INET6 aka /dev/icmp6 */
202 struct streamtab icmpinfov6 = {
203 	&icmprinitv6, &icmpwinit
204 };
205 
206 static sin_t	sin_null;	/* Zero address for quick clears */
207 static sin6_t	sin6_null;	/* Zero address for quick clears */
208 
209 /* Default structure copied into T_INFO_ACK messages */
210 static struct T_info_ack icmp_g_t_info_ack = {
211 	T_INFO_ACK,
212 	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
213 	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
214 	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
215 	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
216 	0,		/* ADDR_size - filled in later. */
217 	0,		/* OPT_size - not initialized here */
218 	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
219 	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
220 	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
221 	(XPG4_1|SENDZERO) /* PROVIDER_flag */
222 };
223 
224 /*
225  * Table of ND variables supported by icmp.  These are loaded into is_nd
226  * when the stack instance is created.
227  * All of these are alterable, within the min/max values given, at run time.
228  */
229 static icmpparam_t	icmp_param_arr[] = {
230 	/* min	max	value	name */
231 	{ 0,	128,	32,	"icmp_wroff_extra" },
232 	{ 1,	255,	255,	"icmp_ipv4_ttl" },
233 	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
234 	{ 0,	1,	1,	"icmp_bsd_compat" },
235 	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
236 	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
237 	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
238 	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
239 	{ 0,	1,	0,	"icmp_pmtu_discovery" },
240 	{ 0,	1,	0,	"icmp_sendto_ignerr" },
241 };
242 #define	is_wroff_extra			is_param_arr[0].icmp_param_value
243 #define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
244 #define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
245 #define	is_bsd_compat			is_param_arr[3].icmp_param_value
246 #define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
247 #define	is_xmit_lowat			is_param_arr[5].icmp_param_value
248 #define	is_recv_hiwat			is_param_arr[6].icmp_param_value
249 #define	is_max_buf			is_param_arr[7].icmp_param_value
250 #define	is_pmtu_discovery		is_param_arr[8].icmp_param_value
251 #define	is_sendto_ignerr		is_param_arr[9].icmp_param_value
252 
253 typedef union T_primitives *t_primp_t;
254 
255 /*
256  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
257  * passed to icmp_wput.
258  * It calls IP to verify the local IP address, and calls IP to insert
259  * the conn_t in the fanout table.
260  * If everything is ok it then sends the T_BIND_ACK back up.
261  */
262 static void
263 icmp_tpi_bind(queue_t *q, mblk_t *mp)
264 {
265 	int	error;
266 	struct sockaddr *sa;
267 	struct T_bind_req *tbr;
268 	socklen_t	len;
269 	sin_t	*sin;
270 	sin6_t	*sin6;
271 	icmp_t		*icmp;
272 	conn_t	*connp = Q_TO_CONN(q);
273 	mblk_t *mp1;
274 	cred_t *cr;
275 
276 	/*
277 	 * All Solaris components should pass a db_credp
278 	 * for this TPI message, hence we ASSERT.
279 	 * But in case there is some other M_PROTO that looks
280 	 * like a TPI message sent by some other kernel
281 	 * component, we check and return an error.
282 	 */
283 	cr = msg_getcred(mp, NULL);
284 	ASSERT(cr != NULL);
285 	if (cr == NULL) {
286 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
287 		return;
288 	}
289 
290 	icmp = connp->conn_icmp;
291 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
292 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
293 		    "icmp_bind: bad req, len %u",
294 		    (uint_t)(mp->b_wptr - mp->b_rptr));
295 		icmp_err_ack(q, mp, TPROTO, 0);
296 		return;
297 	}
298 
299 	if (icmp->icmp_state != TS_UNBND) {
300 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
301 		    "icmp_bind: bad state, %u", icmp->icmp_state);
302 		icmp_err_ack(q, mp, TOUTSTATE, 0);
303 		return;
304 	}
305 
306 	/*
307 	 * Reallocate the message to make sure we have enough room for an
308 	 * address.
309 	 */
310 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
311 	if (mp1 == NULL) {
312 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
313 		return;
314 	}
315 	mp = mp1;
316 
317 	/* Reset the message type in preparation for shipping it back. */
318 	DB_TYPE(mp) = M_PCPROTO;
319 	tbr = (struct T_bind_req *)mp->b_rptr;
320 	len = tbr->ADDR_length;
321 	switch (len) {
322 	case 0:	/* request for a generic port */
323 		tbr->ADDR_offset = sizeof (struct T_bind_req);
324 		if (connp->conn_family == AF_INET) {
325 			tbr->ADDR_length = sizeof (sin_t);
326 			sin = (sin_t *)&tbr[1];
327 			*sin = sin_null;
328 			sin->sin_family = AF_INET;
329 			mp->b_wptr = (uchar_t *)&sin[1];
330 			sa = (struct sockaddr *)sin;
331 			len = sizeof (sin_t);
332 		} else {
333 			ASSERT(connp->conn_family == AF_INET6);
334 			tbr->ADDR_length = sizeof (sin6_t);
335 			sin6 = (sin6_t *)&tbr[1];
336 			*sin6 = sin6_null;
337 			sin6->sin6_family = AF_INET6;
338 			mp->b_wptr = (uchar_t *)&sin6[1];
339 			sa = (struct sockaddr *)sin6;
340 			len = sizeof (sin6_t);
341 		}
342 		break;
343 
344 	case sizeof (sin_t):	/* Complete IPv4 address */
345 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
346 		    sizeof (sin_t));
347 		break;
348 
349 	case sizeof (sin6_t):	/* Complete IPv6 address */
350 		sa = (struct sockaddr *)mi_offset_param(mp,
351 		    tbr->ADDR_offset, sizeof (sin6_t));
352 		break;
353 
354 	default:
355 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
356 		    "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
357 		icmp_err_ack(q, mp, TBADADDR, 0);
358 		return;
359 	}
360 
361 	error = rawip_do_bind(connp, sa, len);
362 	if (error != 0) {
363 		if (error > 0) {
364 			icmp_err_ack(q, mp, TSYSERR, error);
365 		} else {
366 			icmp_err_ack(q, mp, -error, 0);
367 		}
368 	} else {
369 		tbr->PRIM_type = T_BIND_ACK;
370 		qreply(q, mp);
371 	}
372 }
373 
374 static int
375 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
376 {
377 	sin_t		*sin;
378 	sin6_t		*sin6;
379 	icmp_t		*icmp = connp->conn_icmp;
380 	int		error = 0;
381 	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
382 	in_port_t	lport;		/* Network byte order */
383 	ipaddr_t	v4src;		/* Set if AF_INET */
384 	in6_addr_t	v6src;
385 	uint_t		scopeid = 0;
386 	zoneid_t	zoneid = IPCL_ZONEID(connp);
387 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
388 
389 	if (sa == NULL || !OK_32PTR((char *)sa)) {
390 		return (EINVAL);
391 	}
392 
393 	switch (len) {
394 	case sizeof (sin_t):    /* Complete IPv4 address */
395 		sin = (sin_t *)sa;
396 		if (sin->sin_family != AF_INET ||
397 		    connp->conn_family != AF_INET) {
398 			/* TSYSERR, EAFNOSUPPORT */
399 			return (EAFNOSUPPORT);
400 		}
401 		v4src = sin->sin_addr.s_addr;
402 		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
403 		if (v4src != INADDR_ANY) {
404 			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
405 			    B_TRUE);
406 		}
407 		lport = sin->sin_port;
408 		break;
409 	case sizeof (sin6_t): /* Complete IPv6 address */
410 		sin6 = (sin6_t *)sa;
411 		if (sin6->sin6_family != AF_INET6 ||
412 		    connp->conn_family != AF_INET6) {
413 			/* TSYSERR, EAFNOSUPPORT */
414 			return (EAFNOSUPPORT);
415 		}
416 		/* No support for mapped addresses on raw sockets */
417 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
418 			/* TSYSERR, EADDRNOTAVAIL */
419 			return (EADDRNOTAVAIL);
420 		}
421 		v6src = sin6->sin6_addr;
422 		if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
423 			if (IN6_IS_ADDR_LINKSCOPE(&v6src))
424 				scopeid = sin6->sin6_scope_id;
425 			laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
426 			    B_TRUE, scopeid);
427 		}
428 		lport = sin6->sin6_port;
429 		break;
430 
431 	default:
432 		/* TBADADDR */
433 		return (EADDRNOTAVAIL);
434 	}
435 
436 	/* Is the local address a valid unicast, multicast, or broadcast? */
437 	if (laddr_type == IPVL_BAD)
438 		return (EADDRNOTAVAIL);
439 
440 	/*
441 	 * The state must be TS_UNBND.
442 	 */
443 	mutex_enter(&connp->conn_lock);
444 	if (icmp->icmp_state != TS_UNBND) {
445 		mutex_exit(&connp->conn_lock);
446 		return (-TOUTSTATE);
447 	}
448 
449 	/*
450 	 * Copy the source address into our icmp structure.  This address
451 	 * may still be zero; if so, ip will fill in the correct address
452 	 * each time an outbound packet is passed to it.
453 	 * If we are binding to a broadcast or multicast address then
454 	 * we just set the conn_bound_addr since we don't want to use
455 	 * that as the source address when sending.
456 	 */
457 	connp->conn_bound_addr_v6 = v6src;
458 	connp->conn_laddr_v6 = v6src;
459 	if (scopeid != 0) {
460 		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
461 		connp->conn_ixa->ixa_scopeid = scopeid;
462 		connp->conn_incoming_ifindex = scopeid;
463 	} else {
464 		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
465 		connp->conn_incoming_ifindex = connp->conn_bound_if;
466 	}
467 
468 	switch (laddr_type) {
469 	case IPVL_UNICAST_UP:
470 	case IPVL_UNICAST_DOWN:
471 		connp->conn_saddr_v6 = v6src;
472 		connp->conn_mcbc_bind = B_FALSE;
473 		break;
474 	case IPVL_MCAST:
475 	case IPVL_BCAST:
476 		/* ip_set_destination will pick a source address later */
477 		connp->conn_saddr_v6 = ipv6_all_zeros;
478 		connp->conn_mcbc_bind = B_TRUE;
479 		break;
480 	}
481 
482 	/* Any errors after this point should use late_error */
483 
484 	/*
485 	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
486 	 * with IPPROTO_TCP.
487 	 */
488 	connp->conn_lport = lport;
489 	connp->conn_fport = 0;
490 
491 	if (connp->conn_family == AF_INET) {
492 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
493 	} else {
494 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
495 	}
496 
497 	icmp->icmp_state = TS_IDLE;
498 
499 	/*
500 	 * We create an initial header template here to make a subsequent
501 	 * sendto have a starting point. Since conn_last_dst is zero the
502 	 * first sendto will always follow the 'dst changed' code path.
503 	 * Note that we defer massaging options and the related checksum
504 	 * adjustment until we have a destination address.
505 	 */
506 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
507 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
508 	if (error != 0) {
509 		mutex_exit(&connp->conn_lock);
510 		goto late_error;
511 	}
512 	/* Just in case */
513 	connp->conn_faddr_v6 = ipv6_all_zeros;
514 	connp->conn_v6lastdst = ipv6_all_zeros;
515 	mutex_exit(&connp->conn_lock);
516 
517 	error = ip_laddr_fanout_insert(connp);
518 	if (error != 0)
519 		goto late_error;
520 
521 	/* Bind succeeded */
522 	return (0);
523 
524 late_error:
525 	mutex_enter(&connp->conn_lock);
526 	connp->conn_saddr_v6 = ipv6_all_zeros;
527 	connp->conn_bound_addr_v6 = ipv6_all_zeros;
528 	connp->conn_laddr_v6 = ipv6_all_zeros;
529 	if (scopeid != 0) {
530 		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
531 		connp->conn_incoming_ifindex = connp->conn_bound_if;
532 	}
533 	icmp->icmp_state = TS_UNBND;
534 	connp->conn_v6lastdst = ipv6_all_zeros;
535 	connp->conn_lport = 0;
536 
537 	/* Restore the header that was built above - different source address */
538 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
539 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
540 	mutex_exit(&connp->conn_lock);
541 	return (error);
542 }
543 
544 /*
545  * Tell IP to just bind to the protocol.
546  */
547 static void
548 icmp_bind_proto(icmp_t *icmp)
549 {
550 	conn_t	*connp = icmp->icmp_connp;
551 
552 	mutex_enter(&connp->conn_lock);
553 	connp->conn_saddr_v6 = ipv6_all_zeros;
554 	connp->conn_laddr_v6 = ipv6_all_zeros;
555 	connp->conn_faddr_v6 = ipv6_all_zeros;
556 	connp->conn_v6lastdst = ipv6_all_zeros;
557 	mutex_exit(&connp->conn_lock);
558 
559 	(void) ip_laddr_fanout_insert(connp);
560 }
561 
562 /*
563  * This routine handles each T_CONN_REQ message passed to icmp.  It
564  * associates a default destination address with the stream.
565  *
566  * After various error checks are completed, icmp_connect() lays
567  * the target address and port into the composite header template.
568  * Then we ask IP for information, including a source address if we didn't
569  * already have one. Finally we send up the T_OK_ACK reply message.
570  */
571 static void
572 icmp_tpi_connect(queue_t *q, mblk_t *mp)
573 {
574 	conn_t	*connp = Q_TO_CONN(q);
575 	struct T_conn_req	*tcr;
576 	struct sockaddr *sa;
577 	socklen_t len;
578 	int error;
579 	cred_t *cr;
580 	pid_t pid;
581 	/*
582 	 * All Solaris components should pass a db_credp
583 	 * for this TPI message, hence we ASSERT.
584 	 * But in case there is some other M_PROTO that looks
585 	 * like a TPI message sent by some other kernel
586 	 * component, we check and return an error.
587 	 */
588 	cr = msg_getcred(mp, &pid);
589 	ASSERT(cr != NULL);
590 	if (cr == NULL) {
591 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
592 		return;
593 	}
594 
595 	tcr = (struct T_conn_req *)mp->b_rptr;
596 	/* Sanity checks */
597 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
598 		icmp_err_ack(q, mp, TPROTO, 0);
599 		return;
600 	}
601 
602 	if (tcr->OPT_length != 0) {
603 		icmp_err_ack(q, mp, TBADOPT, 0);
604 		return;
605 	}
606 
607 	len = tcr->DEST_length;
608 
609 	switch (len) {
610 	default:
611 		icmp_err_ack(q, mp, TBADADDR, 0);
612 		return;
613 	case sizeof (sin_t):
614 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
615 		    sizeof (sin_t));
616 		break;
617 	case sizeof (sin6_t):
618 		sa = (struct sockaddr *)mi_offset_param(mp,
619 		    tcr->DEST_offset, sizeof (sin6_t));
620 		break;
621 	}
622 
623 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
624 	if (error != 0) {
625 		icmp_err_ack(q, mp, TSYSERR, error);
626 		return;
627 	}
628 
629 	error = rawip_do_connect(connp, sa, len, cr, pid);
630 	if (error != 0) {
631 		if (error < 0) {
632 			icmp_err_ack(q, mp, -error, 0);
633 		} else {
634 			icmp_err_ack(q, mp, 0, error);
635 		}
636 	} else {
637 		mblk_t *mp1;
638 
639 		/*
640 		 * We have to send a connection confirmation to
641 		 * keep TLI happy.
642 		 */
643 		if (connp->conn_family == AF_INET) {
644 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
645 			    sizeof (sin_t), NULL, 0);
646 		} else {
647 			ASSERT(connp->conn_family == AF_INET6);
648 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
649 			    sizeof (sin6_t), NULL, 0);
650 		}
651 		if (mp1 == NULL) {
652 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
653 			return;
654 		}
655 
656 		/*
657 		 * Send ok_ack for T_CONN_REQ
658 		 */
659 		mp = mi_tpi_ok_ack_alloc(mp);
660 		if (mp == NULL) {
661 			/* Unable to reuse the T_CONN_REQ for the ack. */
662 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
663 			return;
664 		}
665 		putnext(connp->conn_rq, mp);
666 		putnext(connp->conn_rq, mp1);
667 	}
668 }
669 
670 static int
671 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
672     cred_t *cr, pid_t pid)
673 {
674 	icmp_t		*icmp;
675 	sin_t		*sin;
676 	sin6_t		*sin6;
677 	int		error;
678 	uint16_t 	dstport;
679 	ipaddr_t	v4dst;
680 	in6_addr_t	v6dst;
681 	uint32_t	flowinfo;
682 	ip_xmit_attr_t	*ixa;
683 	uint_t		scopeid = 0;
684 	uint_t		srcid = 0;
685 	in6_addr_t	v6src = connp->conn_saddr_v6;
686 
687 	icmp = connp->conn_icmp;
688 
689 	if (sa == NULL || !OK_32PTR((char *)sa)) {
690 		return (EINVAL);
691 	}
692 
693 	ASSERT(sa != NULL && len != 0);
694 
695 	/*
696 	 * Determine packet type based on type of address passed in
697 	 * the request should contain an IPv4 or IPv6 address.
698 	 * Make sure that address family matches the type of
699 	 * family of the address passed down.
700 	 */
701 	switch (len) {
702 	case sizeof (sin_t):
703 		sin = (sin_t *)sa;
704 
705 		v4dst = sin->sin_addr.s_addr;
706 		dstport = sin->sin_port;
707 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
708 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
709 		break;
710 
711 	case sizeof (sin6_t):
712 		sin6 = (sin6_t *)sa;
713 
714 		/* No support for mapped addresses on raw sockets */
715 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
716 			return (EADDRNOTAVAIL);
717 		}
718 		v6dst = sin6->sin6_addr;
719 		dstport = sin6->sin6_port;
720 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
721 		flowinfo = sin6->sin6_flowinfo;
722 		if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
723 			scopeid = sin6->sin6_scope_id;
724 		srcid = sin6->__sin6_src_id;
725 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
726 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
727 			    connp->conn_netstack);
728 		}
729 		break;
730 	}
731 
732 	/*
733 	 * If there is a different thread using conn_ixa then we get a new
734 	 * copy and cut the old one loose from conn_ixa. Otherwise we use
735 	 * conn_ixa and prevent any other thread from using/changing it.
736 	 * Once connect() is done other threads can use conn_ixa since the
737 	 * refcnt will be back at one.
738 	 */
739 	ixa = conn_get_ixa(connp, B_TRUE);
740 	if (ixa == NULL)
741 		return (ENOMEM);
742 
743 	ASSERT(ixa->ixa_refcnt >= 2);
744 	ASSERT(ixa == connp->conn_ixa);
745 
746 	mutex_enter(&connp->conn_lock);
747 	/*
748 	 * This icmp_t must have bound already before doing a connect.
749 	 * Reject if a connect is in progress (we drop conn_lock during
750 	 * rawip_do_connect).
751 	 */
752 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
753 		mutex_exit(&connp->conn_lock);
754 		ixa_refrele(ixa);
755 		return (-TOUTSTATE);
756 	}
757 
758 	if (icmp->icmp_state == TS_DATA_XFER) {
759 		/* Already connected - clear out state */
760 		if (connp->conn_mcbc_bind)
761 			connp->conn_saddr_v6 = ipv6_all_zeros;
762 		else
763 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
764 		connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
765 		connp->conn_faddr_v6 = ipv6_all_zeros;
766 		icmp->icmp_state = TS_IDLE;
767 	}
768 
769 	/*
770 	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
771 	 * with IPPROTO_TCP.
772 	 */
773 	connp->conn_fport = dstport;
774 	if (connp->conn_ipversion == IPV4_VERSION) {
775 		/*
776 		 * Interpret a zero destination to mean loopback.
777 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
778 		 * generate the T_CONN_CON.
779 		 */
780 		if (v4dst == INADDR_ANY) {
781 			v4dst = htonl(INADDR_LOOPBACK);
782 			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
783 			ASSERT(connp->conn_family == AF_INET);
784 			sin->sin_addr.s_addr = v4dst;
785 		}
786 		connp->conn_faddr_v6 = v6dst;
787 		connp->conn_flowinfo = 0;
788 	} else {
789 		ASSERT(connp->conn_ipversion == IPV6_VERSION);
790 		/*
791 		 * Interpret a zero destination to mean loopback.
792 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
793 		 * generate the T_CONN_CON.
794 		 */
795 		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
796 			v6dst = ipv6_loopback;
797 			sin6->sin6_addr = v6dst;
798 		}
799 		connp->conn_faddr_v6 = v6dst;
800 		connp->conn_flowinfo = flowinfo;
801 	}
802 
803 	ixa->ixa_cred = cr;
804 	ixa->ixa_cpid = pid;
805 	if (is_system_labeled()) {
806 		/* We need to restart with a label based on the cred */
807 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
808 	}
809 
810 	if (scopeid != 0) {
811 		ixa->ixa_flags |= IXAF_SCOPEID_SET;
812 		ixa->ixa_scopeid = scopeid;
813 		connp->conn_incoming_ifindex = scopeid;
814 	} else {
815 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
816 		connp->conn_incoming_ifindex = connp->conn_bound_if;
817 	}
818 
819 	/*
820 	 * conn_connect will drop conn_lock and reacquire it.
821 	 * To prevent a send* from messing with this icmp_t while the lock
822 	 * is dropped we set icmp_state and clear conn_v6lastdst.
823 	 * That will make all send* fail with EISCONN.
824 	 */
825 	connp->conn_v6lastdst = ipv6_all_zeros;
826 	icmp->icmp_state = TS_WCON_CREQ;
827 
828 	error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
829 	mutex_exit(&connp->conn_lock);
830 	if (error != 0)
831 		goto connect_failed;
832 
833 	/*
834 	 * The addresses have been verified. Time to insert in
835 	 * the correct fanout list.
836 	 */
837 	error = ipcl_conn_insert(connp);
838 	if (error != 0)
839 		goto connect_failed;
840 
841 	mutex_enter(&connp->conn_lock);
842 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
843 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
844 	if (error != 0) {
845 		mutex_exit(&connp->conn_lock);
846 		goto connect_failed;
847 	}
848 
849 	icmp->icmp_state = TS_DATA_XFER;
850 	/* Record this as the "last" send even though we haven't sent any */
851 	connp->conn_v6lastdst = connp->conn_faddr_v6;
852 	connp->conn_lastipversion = connp->conn_ipversion;
853 	connp->conn_lastdstport = connp->conn_fport;
854 	connp->conn_lastflowinfo = connp->conn_flowinfo;
855 	connp->conn_lastscopeid = scopeid;
856 	connp->conn_lastsrcid = srcid;
857 	/* Also remember a source to use together with lastdst */
858 	connp->conn_v6lastsrc = v6src;
859 	mutex_exit(&connp->conn_lock);
860 
861 	ixa_refrele(ixa);
862 	return (0);
863 
864 connect_failed:
865 	if (ixa != NULL)
866 		ixa_refrele(ixa);
867 	mutex_enter(&connp->conn_lock);
868 	icmp->icmp_state = TS_IDLE;
869 	/* In case the source address was set above */
870 	if (connp->conn_mcbc_bind)
871 		connp->conn_saddr_v6 = ipv6_all_zeros;
872 	else
873 		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
874 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
875 	connp->conn_faddr_v6 = ipv6_all_zeros;
876 	connp->conn_v6lastdst = ipv6_all_zeros;
877 	connp->conn_flowinfo = 0;
878 
879 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
880 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
881 	mutex_exit(&connp->conn_lock);
882 	return (error);
883 }
884 
885 static void
886 rawip_do_close(conn_t *connp)
887 {
888 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
889 
890 	ip_quiesce_conn(connp);
891 
892 	if (!IPCL_IS_NONSTR(connp)) {
893 		qprocsoff(connp->conn_rq);
894 	}
895 
896 	icmp_close_free(connp);
897 
898 	/*
899 	 * Now we are truly single threaded on this stream, and can
900 	 * delete the things hanging off the connp, and finally the connp.
901 	 * We removed this connp from the fanout list, it cannot be
902 	 * accessed thru the fanouts, and we already waited for the
903 	 * conn_ref to drop to 0. We are already in close, so
904 	 * there cannot be any other thread from the top. qprocsoff
905 	 * has completed, and service has completed or won't run in
906 	 * future.
907 	 */
908 	ASSERT(connp->conn_ref == 1);
909 
910 	if (!IPCL_IS_NONSTR(connp)) {
911 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
912 	} else {
913 		ip_free_helper_stream(connp);
914 	}
915 
916 	connp->conn_ref--;
917 	ipcl_conn_destroy(connp);
918 }
919 
920 static int
921 icmp_close(queue_t *q, int flags)
922 {
923 	conn_t  *connp;
924 
925 	if (flags & SO_FALLBACK) {
926 		/*
927 		 * stream is being closed while in fallback
928 		 * simply free the resources that were allocated
929 		 */
930 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
931 		qprocsoff(q);
932 		goto done;
933 	}
934 
935 	connp = Q_TO_CONN(q);
936 	(void) rawip_do_close(connp);
937 done:
938 	q->q_ptr = WR(q)->q_ptr = NULL;
939 	return (0);
940 }
941 
942 static void
943 icmp_close_free(conn_t *connp)
944 {
945 	icmp_t *icmp = connp->conn_icmp;
946 
947 	if (icmp->icmp_filter != NULL) {
948 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
949 		icmp->icmp_filter = NULL;
950 	}
951 
952 	/*
953 	 * Clear any fields which the kmem_cache constructor clears.
954 	 * Only icmp_connp needs to be preserved.
955 	 * TBD: We should make this more efficient to avoid clearing
956 	 * everything.
957 	 */
958 	ASSERT(icmp->icmp_connp == connp);
959 	bzero(icmp, sizeof (icmp_t));
960 	icmp->icmp_connp = connp;
961 }
962 
963 /*
964  * This routine handles each T_DISCON_REQ message passed to icmp
965  * as an indicating that ICMP is no longer connected. This results
966  * in telling IP to restore the binding to just the local address.
967  */
968 static int
969 icmp_do_disconnect(conn_t *connp)
970 {
971 	icmp_t	*icmp = connp->conn_icmp;
972 	int	error;
973 
974 	mutex_enter(&connp->conn_lock);
975 	if (icmp->icmp_state != TS_DATA_XFER) {
976 		mutex_exit(&connp->conn_lock);
977 		return (-TOUTSTATE);
978 	}
979 	if (connp->conn_mcbc_bind)
980 		connp->conn_saddr_v6 = ipv6_all_zeros;
981 	else
982 		connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
983 	connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
984 	connp->conn_faddr_v6 = ipv6_all_zeros;
985 	icmp->icmp_state = TS_IDLE;
986 
987 	connp->conn_v6lastdst = ipv6_all_zeros;
988 	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
989 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
990 	mutex_exit(&connp->conn_lock);
991 	if (error != 0)
992 		return (error);
993 
994 	/*
995 	 * Tell IP to remove the full binding and revert
996 	 * to the local address binding.
997 	 */
998 	return (ip_laddr_fanout_insert(connp));
999 }
1000 
1001 static void
1002 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1003 {
1004 	conn_t	*connp = Q_TO_CONN(q);
1005 	int	error;
1006 
1007 	/*
1008 	 * Allocate the largest primitive we need to send back
1009 	 * T_error_ack is > than T_ok_ack
1010 	 */
1011 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1012 	if (mp == NULL) {
1013 		/* Unable to reuse the T_DISCON_REQ for the ack. */
1014 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1015 		return;
1016 	}
1017 
1018 	error = icmp_do_disconnect(connp);
1019 
1020 	if (error != 0) {
1021 		if (error > 0) {
1022 			icmp_err_ack(q, mp, 0, error);
1023 		} else {
1024 			icmp_err_ack(q, mp, -error, 0);
1025 		}
1026 	} else {
1027 		mp = mi_tpi_ok_ack_alloc(mp);
1028 		ASSERT(mp != NULL);
1029 		qreply(q, mp);
1030 	}
1031 }
1032 
1033 static int
1034 icmp_disconnect(conn_t *connp)
1035 {
1036 	int	error;
1037 
1038 	connp->conn_dgram_errind = B_FALSE;
1039 
1040 	error = icmp_do_disconnect(connp);
1041 
1042 	if (error < 0)
1043 		error = proto_tlitosyserr(-error);
1044 	return (error);
1045 }
1046 
1047 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1048 static void
1049 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1050 {
1051 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1052 		qreply(q, mp);
1053 }
1054 
1055 /* Shorthand to generate and send TPI error acks to our client */
1056 static void
1057 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1058     t_scalar_t t_error, int sys_error)
1059 {
1060 	struct T_error_ack	*teackp;
1061 
1062 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1063 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1064 		teackp = (struct T_error_ack *)mp->b_rptr;
1065 		teackp->ERROR_prim = primitive;
1066 		teackp->TLI_error = t_error;
1067 		teackp->UNIX_error = sys_error;
1068 		qreply(q, mp);
1069 	}
1070 }
1071 
1072 /*
1073  * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1074  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1075  * Assumes that IP has pulled up everything up to and including the ICMP header.
1076  */
1077 /* ARGSUSED2 */
1078 static void
1079 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1080 {
1081 	conn_t		*connp = (conn_t *)arg1;
1082 	icmp_t		*icmp = connp->conn_icmp;
1083 	icmph_t		*icmph;
1084 	ipha_t		*ipha;
1085 	int		iph_hdr_length;
1086 	sin_t		sin;
1087 	mblk_t		*mp1;
1088 	int		error = 0;
1089 
1090 	ipha = (ipha_t *)mp->b_rptr;
1091 
1092 	ASSERT(OK_32PTR(mp->b_rptr));
1093 
1094 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1095 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1096 		icmp_icmp_error_ipv6(connp, mp, ira);
1097 		return;
1098 	}
1099 	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1100 
1101 	/* Skip past the outer IP and ICMP headers */
1102 	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1103 	iph_hdr_length = ira->ira_ip_hdr_length;
1104 	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1105 	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */
1106 
1107 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1108 
1109 	switch (icmph->icmph_type) {
1110 	case ICMP_DEST_UNREACHABLE:
1111 		switch (icmph->icmph_code) {
1112 		case ICMP_FRAGMENTATION_NEEDED: {
1113 			ipha_t		*ipha;
1114 			ip_xmit_attr_t	*ixa;
1115 			/*
1116 			 * IP has already adjusted the path MTU.
1117 			 * But we need to adjust DF for IPv4.
1118 			 */
1119 			if (connp->conn_ipversion != IPV4_VERSION)
1120 				break;
1121 
1122 			ixa = conn_get_ixa(connp, B_FALSE);
1123 			if (ixa == NULL || ixa->ixa_ire == NULL) {
1124 				/*
1125 				 * Some other thread holds conn_ixa. We will
1126 				 * redo this on the next ICMP too big.
1127 				 */
1128 				if (ixa != NULL)
1129 					ixa_refrele(ixa);
1130 				break;
1131 			}
1132 			(void) ip_get_pmtu(ixa);
1133 
1134 			mutex_enter(&connp->conn_lock);
1135 			ipha = (ipha_t *)connp->conn_ht_iphc;
1136 			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1137 				ipha->ipha_fragment_offset_and_flags |=
1138 				    IPH_DF_HTONS;
1139 			} else {
1140 				ipha->ipha_fragment_offset_and_flags &=
1141 				    ~IPH_DF_HTONS;
1142 			}
1143 			mutex_exit(&connp->conn_lock);
1144 			ixa_refrele(ixa);
1145 			break;
1146 		}
1147 		case ICMP_PORT_UNREACHABLE:
1148 		case ICMP_PROTOCOL_UNREACHABLE:
1149 			error = ECONNREFUSED;
1150 			break;
1151 		default:
1152 			/* Transient errors */
1153 			break;
1154 		}
1155 		break;
1156 	default:
1157 		/* Transient errors */
1158 		break;
1159 	}
1160 	if (error == 0) {
1161 		freemsg(mp);
1162 		return;
1163 	}
1164 
1165 	/*
1166 	 * Deliver T_UDERROR_IND when the application has asked for it.
1167 	 * The socket layer enables this automatically when connected.
1168 	 */
1169 	if (!connp->conn_dgram_errind) {
1170 		freemsg(mp);
1171 		return;
1172 	}
1173 
1174 	sin = sin_null;
1175 	sin.sin_family = AF_INET;
1176 	sin.sin_addr.s_addr = ipha->ipha_dst;
1177 
1178 	if (IPCL_IS_NONSTR(connp)) {
1179 		mutex_enter(&connp->conn_lock);
1180 		if (icmp->icmp_state == TS_DATA_XFER) {
1181 			if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1182 				mutex_exit(&connp->conn_lock);
1183 				(*connp->conn_upcalls->su_set_error)
1184 				    (connp->conn_upper_handle, error);
1185 				goto done;
1186 			}
1187 		} else {
1188 			icmp->icmp_delayed_error = error;
1189 			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1190 		}
1191 		mutex_exit(&connp->conn_lock);
1192 	} else {
1193 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1194 		    error);
1195 		if (mp1 != NULL)
1196 			putnext(connp->conn_rq, mp1);
1197 	}
1198 done:
1199 	freemsg(mp);
1200 }
1201 
1202 /*
1203  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6.
1204  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1205  * Assumes that IP has pulled up all the extension headers as well as the
1206  * ICMPv6 header.
1207  */
1208 static void
1209 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1210 {
1211 	icmp6_t		*icmp6;
1212 	ip6_t		*ip6h, *outer_ip6h;
1213 	uint16_t	iph_hdr_length;
1214 	uint8_t		*nexthdrp;
1215 	sin6_t		sin6;
1216 	mblk_t		*mp1;
1217 	int		error = 0;
1218 	icmp_t		*icmp = connp->conn_icmp;
1219 
1220 	outer_ip6h = (ip6_t *)mp->b_rptr;
1221 #ifdef DEBUG
1222 	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1223 		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1224 	else
1225 		iph_hdr_length = IPV6_HDR_LEN;
1226 	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1227 #endif
1228 	/* Skip past the outer IP and ICMP headers */
1229 	iph_hdr_length = ira->ira_ip_hdr_length;
1230 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1231 
1232 	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
1233 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1234 		freemsg(mp);
1235 		return;
1236 	}
1237 
1238 	switch (icmp6->icmp6_type) {
1239 	case ICMP6_DST_UNREACH:
1240 		switch (icmp6->icmp6_code) {
1241 		case ICMP6_DST_UNREACH_NOPORT:
1242 			error = ECONNREFUSED;
1243 			break;
1244 		case ICMP6_DST_UNREACH_ADMIN:
1245 		case ICMP6_DST_UNREACH_NOROUTE:
1246 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1247 		case ICMP6_DST_UNREACH_ADDR:
1248 			/* Transient errors */
1249 			break;
1250 		default:
1251 			break;
1252 		}
1253 		break;
1254 	case ICMP6_PACKET_TOO_BIG: {
1255 		struct T_unitdata_ind	*tudi;
1256 		struct T_opthdr		*toh;
1257 		size_t			udi_size;
1258 		mblk_t			*newmp;
1259 		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1260 		    sizeof (struct ip6_mtuinfo);
1261 		sin6_t			*sin6;
1262 		struct ip6_mtuinfo	*mtuinfo;
1263 
1264 		/*
1265 		 * If the application has requested to receive path mtu
1266 		 * information, send up an empty message containing an
1267 		 * IPV6_PATHMTU ancillary data item.
1268 		 */
1269 		if (!connp->conn_ipv6_recvpathmtu)
1270 			break;
1271 
1272 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1273 		    opt_length;
1274 		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1275 			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1276 			break;
1277 		}
1278 
1279 		/*
1280 		 * newmp->b_cont is left to NULL on purpose.  This is an
1281 		 * empty message containing only ancillary data.
1282 		 */
1283 		newmp->b_datap->db_type = M_PROTO;
1284 		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1285 		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1286 		tudi->PRIM_type = T_UNITDATA_IND;
1287 		tudi->SRC_length = sizeof (sin6_t);
1288 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1289 		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1290 		tudi->OPT_length = opt_length;
1291 
1292 		sin6 = (sin6_t *)&tudi[1];
1293 		bzero(sin6, sizeof (sin6_t));
1294 		sin6->sin6_family = AF_INET6;
1295 		sin6->sin6_addr = connp->conn_faddr_v6;
1296 
1297 		toh = (struct T_opthdr *)&sin6[1];
1298 		toh->level = IPPROTO_IPV6;
1299 		toh->name = IPV6_PATHMTU;
1300 		toh->len = opt_length;
1301 		toh->status = 0;
1302 
1303 		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1304 		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1305 		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1306 		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1307 		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1308 		/*
1309 		 * We've consumed everything we need from the original
1310 		 * message.  Free it, then send our empty message.
1311 		 */
1312 		freemsg(mp);
1313 		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1314 		return;
1315 	}
1316 	case ICMP6_TIME_EXCEEDED:
1317 		/* Transient errors */
1318 		break;
1319 	case ICMP6_PARAM_PROB:
1320 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1321 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1322 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1323 		    (uchar_t *)nexthdrp) {
1324 			error = ECONNREFUSED;
1325 			break;
1326 		}
1327 		break;
1328 	}
1329 	if (error == 0) {
1330 		freemsg(mp);
1331 		return;
1332 	}
1333 
1334 	/*
1335 	 * Deliver T_UDERROR_IND when the application has asked for it.
1336 	 * The socket layer enables this automatically when connected.
1337 	 */
1338 	if (!connp->conn_dgram_errind) {
1339 		freemsg(mp);
1340 		return;
1341 	}
1342 
1343 	sin6 = sin6_null;
1344 	sin6.sin6_family = AF_INET6;
1345 	sin6.sin6_addr = ip6h->ip6_dst;
1346 	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1347 	if (IPCL_IS_NONSTR(connp)) {
1348 		mutex_enter(&connp->conn_lock);
1349 		if (icmp->icmp_state == TS_DATA_XFER) {
1350 			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1351 			    &connp->conn_faddr_v6)) {
1352 				mutex_exit(&connp->conn_lock);
1353 				(*connp->conn_upcalls->su_set_error)
1354 				    (connp->conn_upper_handle, error);
1355 				goto done;
1356 			}
1357 		} else {
1358 			icmp->icmp_delayed_error = error;
1359 			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1360 		}
1361 		mutex_exit(&connp->conn_lock);
1362 	} else {
1363 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1364 		    NULL, 0, error);
1365 		if (mp1 != NULL)
1366 			putnext(connp->conn_rq, mp1);
1367 	}
1368 done:
1369 	freemsg(mp);
1370 }
1371 
1372 /*
1373  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1374  * The local address is filled in if endpoint is bound. The remote address
1375  * is filled in if remote address has been precified ("connected endpoint")
1376  * (The concept of connected CLTS sockets is alien to published TPI
1377  *  but we support it anyway).
1378  */
1379 static void
1380 icmp_addr_req(queue_t *q, mblk_t *mp)
1381 {
1382 	struct sockaddr *sa;
1383 	mblk_t	*ackmp;
1384 	struct T_addr_ack *taa;
1385 	icmp_t	*icmp = Q_TO_ICMP(q);
1386 	conn_t	*connp = icmp->icmp_connp;
1387 	uint_t	addrlen;
1388 
1389 	/* Make it large enough for worst case */
1390 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1391 	    2 * sizeof (sin6_t), 1);
1392 	if (ackmp == NULL) {
1393 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1394 		return;
1395 	}
1396 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1397 
1398 	bzero(taa, sizeof (struct T_addr_ack));
1399 	ackmp->b_wptr = (uchar_t *)&taa[1];
1400 
1401 	taa->PRIM_type = T_ADDR_ACK;
1402 	ackmp->b_datap->db_type = M_PCPROTO;
1403 
1404 	if (connp->conn_family == AF_INET)
1405 		addrlen = sizeof (sin_t);
1406 	else
1407 		addrlen = sizeof (sin6_t);
1408 
1409 	mutex_enter(&connp->conn_lock);
1410 	/*
1411 	 * Note: Following code assumes 32 bit alignment of basic
1412 	 * data structures like sin_t and struct T_addr_ack.
1413 	 */
1414 	if (icmp->icmp_state != TS_UNBND) {
1415 		/*
1416 		 * Fill in local address first
1417 		 */
1418 		taa->LOCADDR_offset = sizeof (*taa);
1419 		taa->LOCADDR_length = addrlen;
1420 		sa = (struct sockaddr *)&taa[1];
1421 		(void) conn_getsockname(connp, sa, &addrlen);
1422 		ackmp->b_wptr += addrlen;
1423 	}
1424 	if (icmp->icmp_state == TS_DATA_XFER) {
1425 		/*
1426 		 * connected, fill remote address too
1427 		 */
1428 		taa->REMADDR_length = addrlen;
1429 		/* assumed 32-bit alignment */
1430 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1431 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1432 		(void) conn_getpeername(connp, sa, &addrlen);
1433 		ackmp->b_wptr += addrlen;
1434 	}
1435 	mutex_exit(&connp->conn_lock);
1436 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1437 	qreply(q, ackmp);
1438 }
1439 
1440 static void
1441 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1442 {
1443 	conn_t		*connp = icmp->icmp_connp;
1444 
1445 	*tap = icmp_g_t_info_ack;
1446 
1447 	if (connp->conn_family == AF_INET6)
1448 		tap->ADDR_size = sizeof (sin6_t);
1449 	else
1450 		tap->ADDR_size = sizeof (sin_t);
1451 	tap->CURRENT_state = icmp->icmp_state;
1452 	tap->OPT_size = icmp_max_optsize;
1453 }
1454 
1455 static void
1456 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1457     t_uscalar_t cap_bits1)
1458 {
1459 	tcap->CAP_bits1 = 0;
1460 
1461 	if (cap_bits1 & TC1_INFO) {
1462 		icmp_copy_info(&tcap->INFO_ack, icmp);
1463 		tcap->CAP_bits1 |= TC1_INFO;
1464 	}
1465 }
1466 
1467 /*
1468  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1469  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1470  * icmp_g_t_info_ack.  The current state of the stream is copied from
1471  * icmp_state.
1472  */
1473 static void
1474 icmp_capability_req(queue_t *q, mblk_t *mp)
1475 {
1476 	icmp_t			*icmp = Q_TO_ICMP(q);
1477 	t_uscalar_t		cap_bits1;
1478 	struct T_capability_ack	*tcap;
1479 
1480 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1481 
1482 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1483 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1484 	if (!mp)
1485 		return;
1486 
1487 	tcap = (struct T_capability_ack *)mp->b_rptr;
1488 
1489 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1490 
1491 	qreply(q, mp);
1492 }
1493 
1494 /*
1495  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1496  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1497  * The current state of the stream is copied from icmp_state.
1498  */
1499 static void
1500 icmp_info_req(queue_t *q, mblk_t *mp)
1501 {
1502 	icmp_t	*icmp = Q_TO_ICMP(q);
1503 
1504 	/* Create a T_INFO_ACK message. */
1505 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1506 	    T_INFO_ACK);
1507 	if (!mp)
1508 		return;
1509 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1510 	qreply(q, mp);
1511 }
1512 
1513 static int
1514 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1515     int family)
1516 {
1517 	conn_t *connp;
1518 	dev_t	conn_dev;
1519 	int	error;
1520 
1521 	/* If the stream is already open, return immediately. */
1522 	if (q->q_ptr != NULL)
1523 		return (0);
1524 
1525 	if (sflag == MODOPEN)
1526 		return (EINVAL);
1527 
1528 	/*
1529 	 * Since ICMP is not used so heavily, allocating from the small
1530 	 * arena should be sufficient.
1531 	 */
1532 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1533 		return (EBUSY);
1534 	}
1535 
1536 	if (flag & SO_FALLBACK) {
1537 		/*
1538 		 * Non streams socket needs a stream to fallback to
1539 		 */
1540 		RD(q)->q_ptr = (void *)conn_dev;
1541 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1542 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1543 		qprocson(q);
1544 		return (0);
1545 	}
1546 
1547 	connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1548 	if (connp == NULL) {
1549 		ASSERT(error != 0);
1550 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1551 		return (error);
1552 	}
1553 
1554 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1555 	connp->conn_dev = conn_dev;
1556 	connp->conn_minor_arena = ip_minor_arena_sa;
1557 
1558 	/*
1559 	 * Initialize the icmp_t structure for this stream.
1560 	 */
1561 	q->q_ptr = connp;
1562 	WR(q)->q_ptr = connp;
1563 	connp->conn_rq = q;
1564 	connp->conn_wq = WR(q);
1565 
1566 	WR(q)->q_hiwat = connp->conn_sndbuf;
1567 	WR(q)->q_lowat = connp->conn_sndlowat;
1568 
1569 	qprocson(q);
1570 
1571 	/* Set the Stream head write offset. */
1572 	(void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1573 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1574 
1575 	mutex_enter(&connp->conn_lock);
1576 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1577 	mutex_exit(&connp->conn_lock);
1578 
1579 	icmp_bind_proto(connp->conn_icmp);
1580 
1581 	return (0);
1582 }
1583 
1584 /* For /dev/icmp aka AF_INET open */
1585 static int
1586 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1587 {
1588 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1589 }
1590 
1591 /* For /dev/icmp6 aka AF_INET6 open */
1592 static int
1593 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1594 {
1595 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1596 }
1597 
1598 /*
1599  * This is the open routine for icmp.  It allocates a icmp_t structure for
1600  * the stream and, on the first open of the module, creates an ND table.
1601  */
1602 static conn_t *
1603 rawip_do_open(int family, cred_t *credp, int *err, int flags)
1604 {
1605 	icmp_t	*icmp;
1606 	conn_t *connp;
1607 	zoneid_t zoneid;
1608 	netstack_t *ns;
1609 	icmp_stack_t *is;
1610 	int len;
1611 	boolean_t isv6 = B_FALSE;
1612 
1613 	*err = secpolicy_net_icmpaccess(credp);
1614 	if (*err != 0)
1615 		return (NULL);
1616 
1617 	if (family == AF_INET6)
1618 		isv6 = B_TRUE;
1619 
1620 	ns = netstack_find_by_cred(credp);
1621 	ASSERT(ns != NULL);
1622 	is = ns->netstack_icmp;
1623 	ASSERT(is != NULL);
1624 
1625 	/*
1626 	 * For exclusive stacks we set the zoneid to zero
1627 	 * to make ICMP operate as if in the global zone.
1628 	 */
1629 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1630 		zoneid = GLOBAL_ZONEID;
1631 	else
1632 		zoneid = crgetzoneid(credp);
1633 
1634 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1635 
1636 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1637 	icmp = connp->conn_icmp;
1638 
1639 	/*
1640 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1641 	 * done by netstack_find_by_cred()
1642 	 */
1643 	netstack_rele(ns);
1644 
1645 	/*
1646 	 * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1647 	 * need to lock anything.
1648 	 */
1649 	ASSERT(connp->conn_proto == IPPROTO_ICMP);
1650 	ASSERT(connp->conn_icmp == icmp);
1651 	ASSERT(icmp->icmp_connp == connp);
1652 
1653 	/* Set the initial state of the stream and the privilege status. */
1654 	icmp->icmp_state = TS_UNBND;
1655 	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1656 	if (isv6) {
1657 		connp->conn_family = AF_INET6;
1658 		connp->conn_ipversion = IPV6_VERSION;
1659 		connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1660 		connp->conn_proto = IPPROTO_ICMPV6;
1661 		/* May be changed by a SO_PROTOTYPE socket option. */
1662 		connp->conn_proto = IPPROTO_ICMPV6;
1663 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1664 		connp->conn_ixa->ixa_raw_cksum_offset = 2;
1665 		connp->conn_default_ttl = is->is_ipv6_hoplimit;
1666 		len = sizeof (ip6_t);
1667 	} else {
1668 		connp->conn_family = AF_INET;
1669 		connp->conn_ipversion = IPV4_VERSION;
1670 		connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1671 		/* May be changed by a SO_PROTOTYPE socket option. */
1672 		connp->conn_proto = IPPROTO_ICMP;
1673 		connp->conn_ixa->ixa_protocol = connp->conn_proto;
1674 		connp->conn_default_ttl = is->is_ipv4_ttl;
1675 		len = sizeof (ipha_t);
1676 	}
1677 	connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1678 
1679 	connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1680 
1681 	/*
1682 	 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1683 	 * the checksum is provided in the pre-built packet. We clear
1684 	 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1685 	 * complete IP header and not to compute the transport checksum.
1686 	 */
1687 	connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1688 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1689 	connp->conn_ixa->ixa_zoneid = zoneid;
1690 
1691 	connp->conn_zoneid = zoneid;
1692 
1693 	/*
1694 	 * If the caller has the process-wide flag set, then default to MAC
1695 	 * exempt mode.  This allows read-down to unlabeled hosts.
1696 	 */
1697 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1698 		connp->conn_mac_mode = CONN_MAC_AWARE;
1699 
1700 	connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1701 
1702 	icmp->icmp_is = is;
1703 
1704 	connp->conn_rcvbuf = is->is_recv_hiwat;
1705 	connp->conn_sndbuf = is->is_xmit_hiwat;
1706 	connp->conn_sndlowat = is->is_xmit_lowat;
1707 	connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1708 
1709 	connp->conn_wroff = len + is->is_wroff_extra;
1710 	connp->conn_so_type = SOCK_RAW;
1711 
1712 	connp->conn_recv = icmp_input;
1713 	connp->conn_recvicmp = icmp_icmp_input;
1714 	crhold(credp);
1715 	connp->conn_cred = credp;
1716 	connp->conn_cpid = curproc->p_pid;
1717 	connp->conn_open_time = ddi_get_lbolt64();
1718 	/* Cache things in ixa without an extra refhold */
1719 	connp->conn_ixa->ixa_cred = connp->conn_cred;
1720 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1721 	if (is_system_labeled())
1722 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1723 
1724 	connp->conn_flow_cntrld = B_FALSE;
1725 
1726 	if (is->is_pmtu_discovery)
1727 		connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1728 
1729 	return (connp);
1730 }
1731 
1732 /*
1733  * Which ICMP options OK to set through T_UNITDATA_REQ...
1734  */
1735 /* ARGSUSED */
1736 static boolean_t
1737 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1738 {
1739 	return (B_TRUE);
1740 }
1741 
1742 /*
1743  * This routine gets default values of certain options whose default
1744  * values are maintained by protcol specific code
1745  */
1746 int
1747 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1748 {
1749 	icmp_t *icmp = Q_TO_ICMP(q);
1750 	icmp_stack_t *is = icmp->icmp_is;
1751 	int *i1 = (int *)ptr;
1752 
1753 	switch (level) {
1754 	case IPPROTO_IP:
1755 		switch (name) {
1756 		case IP_MULTICAST_TTL:
1757 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1758 			return (sizeof (uchar_t));
1759 		case IP_MULTICAST_LOOP:
1760 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1761 			return (sizeof (uchar_t));
1762 		}
1763 		break;
1764 	case IPPROTO_IPV6:
1765 		switch (name) {
1766 		case IPV6_MULTICAST_HOPS:
1767 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1768 			return (sizeof (int));
1769 		case IPV6_MULTICAST_LOOP:
1770 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1771 			return (sizeof (int));
1772 		case IPV6_UNICAST_HOPS:
1773 			*i1 = is->is_ipv6_hoplimit;
1774 			return (sizeof (int));
1775 		}
1776 		break;
1777 	case IPPROTO_ICMPV6:
1778 		switch (name) {
1779 		case ICMP6_FILTER:
1780 			/* Make it look like "pass all" */
1781 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1782 			return (sizeof (icmp6_filter_t));
1783 		}
1784 		break;
1785 	}
1786 	return (-1);
1787 }
1788 
1789 /*
1790  * This routine retrieves the current status of socket options.
1791  * It returns the size of the option retrieved, or -1.
1792  */
1793 int
1794 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1795 {
1796 	icmp_t		*icmp = connp->conn_icmp;
1797 	int		*i1 = (int *)ptr;
1798 	conn_opt_arg_t	coas;
1799 	int		retval;
1800 
1801 	coas.coa_connp = connp;
1802 	coas.coa_ixa = connp->conn_ixa;
1803 	coas.coa_ipp = &connp->conn_xmit_ipp;
1804 	coas.coa_ancillary = B_FALSE;
1805 	coas.coa_changed = 0;
1806 
1807 	/*
1808 	 * We assume that the optcom framework has checked for the set
1809 	 * of levels and names that are supported, hence we don't worry
1810 	 * about rejecting based on that.
1811 	 * First check for ICMP specific handling, then pass to common routine.
1812 	 */
1813 	switch (level) {
1814 	case IPPROTO_IP:
1815 		/*
1816 		 * Only allow IPv4 option processing on IPv4 sockets.
1817 		 */
1818 		if (connp->conn_family != AF_INET)
1819 			return (-1);
1820 
1821 		switch (name) {
1822 		case IP_OPTIONS:
1823 		case T_IP_OPTIONS:
1824 			/* Options are passed up with each packet */
1825 			return (0);
1826 		case IP_HDRINCL:
1827 			mutex_enter(&connp->conn_lock);
1828 			*i1 = (int)icmp->icmp_hdrincl;
1829 			mutex_exit(&connp->conn_lock);
1830 			return (sizeof (int));
1831 		}
1832 		break;
1833 
1834 	case IPPROTO_IPV6:
1835 		/*
1836 		 * Only allow IPv6 option processing on native IPv6 sockets.
1837 		 */
1838 		if (connp->conn_family != AF_INET6)
1839 			return (-1);
1840 
1841 		switch (name) {
1842 		case IPV6_CHECKSUM:
1843 			/*
1844 			 * Return offset or -1 if no checksum offset.
1845 			 * Does not apply to IPPROTO_ICMPV6
1846 			 */
1847 			if (connp->conn_proto == IPPROTO_ICMPV6)
1848 				return (-1);
1849 
1850 			mutex_enter(&connp->conn_lock);
1851 			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1852 				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1853 			else
1854 				*i1 = -1;
1855 			mutex_exit(&connp->conn_lock);
1856 			return (sizeof (int));
1857 		}
1858 		break;
1859 
1860 	case IPPROTO_ICMPV6:
1861 		/*
1862 		 * Only allow IPv6 option processing on native IPv6 sockets.
1863 		 */
1864 		if (connp->conn_family != AF_INET6)
1865 			return (-1);
1866 
1867 		if (connp->conn_proto != IPPROTO_ICMPV6)
1868 			return (-1);
1869 
1870 		switch (name) {
1871 		case ICMP6_FILTER:
1872 			mutex_enter(&connp->conn_lock);
1873 			if (icmp->icmp_filter == NULL) {
1874 				/* Make it look like "pass all" */
1875 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1876 			} else {
1877 				(void) bcopy(icmp->icmp_filter, ptr,
1878 				    sizeof (icmp6_filter_t));
1879 			}
1880 			mutex_exit(&connp->conn_lock);
1881 			return (sizeof (icmp6_filter_t));
1882 		}
1883 	}
1884 	mutex_enter(&connp->conn_lock);
1885 	retval = conn_opt_get(&coas, level, name, ptr);
1886 	mutex_exit(&connp->conn_lock);
1887 	return (retval);
1888 }
1889 
1890 /*
1891  * This routine retrieves the current status of socket options.
1892  * It returns the size of the option retrieved, or -1.
1893  */
1894 int
1895 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1896 {
1897 	conn_t		*connp = Q_TO_CONN(q);
1898 	int 		err;
1899 
1900 	err = icmp_opt_get(connp, level, name, ptr);
1901 	return (err);
1902 }
1903 
1904 /*
1905  * This routine sets socket options.
1906  */
1907 int
1908 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
1909     uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
1910 {
1911 	conn_t		*connp = coa->coa_connp;
1912 	ip_xmit_attr_t	*ixa = coa->coa_ixa;
1913 	icmp_t		*icmp = connp->conn_icmp;
1914 	icmp_stack_t	*is = icmp->icmp_is;
1915 	int		*i1 = (int *)invalp;
1916 	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
1917 	int		error;
1918 
1919 	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
1920 
1921 	/*
1922 	 * For fixed length options, no sanity check
1923 	 * of passed in length is done. It is assumed *_optcom_req()
1924 	 * routines do the right thing.
1925 	 */
1926 
1927 	switch (level) {
1928 	case SOL_SOCKET:
1929 		switch (name) {
1930 		case SO_PROTOTYPE:
1931 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
1932 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
1933 			    secpolicy_net_rawaccess(cr) != 0) {
1934 				return (EACCES);
1935 			}
1936 			if (checkonly)
1937 				break;
1938 
1939 			mutex_enter(&connp->conn_lock);
1940 			connp->conn_proto = *i1 & 0xFF;
1941 			ixa->ixa_protocol = connp->conn_proto;
1942 			if ((connp->conn_proto == IPPROTO_RAW ||
1943 			    connp->conn_proto == IPPROTO_IGMP) &&
1944 			    connp->conn_family == AF_INET) {
1945 				icmp->icmp_hdrincl = 1;
1946 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1947 			} else if (connp->conn_proto == IPPROTO_UDP ||
1948 			    connp->conn_proto == IPPROTO_TCP ||
1949 			    connp->conn_proto == IPPROTO_SCTP) {
1950 				/* Used by test applications like psh */
1951 				icmp->icmp_hdrincl = 0;
1952 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
1953 			} else {
1954 				icmp->icmp_hdrincl = 0;
1955 				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
1956 			}
1957 
1958 			if (connp->conn_family == AF_INET6 &&
1959 			    connp->conn_proto == IPPROTO_ICMPV6) {
1960 				/* Set offset for icmp6_cksum */
1961 				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
1962 				ixa->ixa_raw_cksum_offset = 2;
1963 			}
1964 			if (icmp->icmp_filter != NULL &&
1965 			    connp->conn_proto != IPPROTO_ICMPV6) {
1966 				kmem_free(icmp->icmp_filter,
1967 				    sizeof (icmp6_filter_t));
1968 				icmp->icmp_filter = NULL;
1969 			}
1970 			mutex_exit(&connp->conn_lock);
1971 
1972 			coa->coa_changed |= COA_HEADER_CHANGED;
1973 			/*
1974 			 * For SCTP, we don't use icmp_bind_proto() for
1975 			 * raw socket binding.
1976 			 */
1977 			if (connp->conn_proto == IPPROTO_SCTP)
1978 				return (0);
1979 
1980 			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
1981 			return (0);
1982 
1983 		case SO_SNDBUF:
1984 			if (*i1 > is->is_max_buf) {
1985 				return (ENOBUFS);
1986 			}
1987 			break;
1988 		case SO_RCVBUF:
1989 			if (*i1 > is->is_max_buf) {
1990 				return (ENOBUFS);
1991 			}
1992 			break;
1993 		}
1994 		break;
1995 
1996 	case IPPROTO_IP:
1997 		/*
1998 		 * Only allow IPv4 option processing on IPv4 sockets.
1999 		 */
2000 		if (connp->conn_family != AF_INET)
2001 			return (EINVAL);
2002 
2003 		switch (name) {
2004 		case IP_HDRINCL:
2005 			if (!checkonly) {
2006 				mutex_enter(&connp->conn_lock);
2007 				icmp->icmp_hdrincl = onoff;
2008 				if (onoff)
2009 					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2010 				else
2011 					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2012 				mutex_exit(&connp->conn_lock);
2013 			}
2014 			break;
2015 		}
2016 		break;
2017 
2018 	case IPPROTO_IPV6:
2019 		if (connp->conn_family != AF_INET6)
2020 			return (EINVAL);
2021 
2022 		switch (name) {
2023 		case IPV6_CHECKSUM:
2024 			/*
2025 			 * Integer offset into the user data of where the
2026 			 * checksum is located.
2027 			 * Offset of -1 disables option.
2028 			 * Does not apply to IPPROTO_ICMPV6.
2029 			 */
2030 			if (connp->conn_proto == IPPROTO_ICMPV6 ||
2031 			    coa->coa_ancillary) {
2032 				return (EINVAL);
2033 			}
2034 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2035 				/* Negative or not 16 bit aligned offset */
2036 				return (EINVAL);
2037 			}
2038 			if (checkonly)
2039 				break;
2040 
2041 			mutex_enter(&connp->conn_lock);
2042 			if (*i1 == -1) {
2043 				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2044 				ixa->ixa_raw_cksum_offset = 0;
2045 				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2046 			} else {
2047 				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2048 				ixa->ixa_raw_cksum_offset = *i1;
2049 				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2050 			}
2051 			mutex_exit(&connp->conn_lock);
2052 			break;
2053 		}
2054 		break;
2055 
2056 	case IPPROTO_ICMPV6:
2057 		/*
2058 		 * Only allow IPv6 option processing on IPv6 sockets.
2059 		 */
2060 		if (connp->conn_family != AF_INET6)
2061 			return (EINVAL);
2062 		if (connp->conn_proto != IPPROTO_ICMPV6)
2063 			return (EINVAL);
2064 
2065 		switch (name) {
2066 		case ICMP6_FILTER:
2067 			if (checkonly)
2068 				break;
2069 
2070 			if ((inlen != 0) &&
2071 			    (inlen != sizeof (icmp6_filter_t)))
2072 				return (EINVAL);
2073 
2074 			mutex_enter(&connp->conn_lock);
2075 			if (inlen == 0) {
2076 				if (icmp->icmp_filter != NULL) {
2077 					kmem_free(icmp->icmp_filter,
2078 					    sizeof (icmp6_filter_t));
2079 					icmp->icmp_filter = NULL;
2080 				}
2081 			} else {
2082 				if (icmp->icmp_filter == NULL) {
2083 					icmp->icmp_filter = kmem_alloc(
2084 					    sizeof (icmp6_filter_t),
2085 					    KM_NOSLEEP);
2086 					if (icmp->icmp_filter == NULL) {
2087 						mutex_exit(&connp->conn_lock);
2088 						return (ENOBUFS);
2089 					}
2090 				}
2091 				(void) bcopy(invalp, icmp->icmp_filter, inlen);
2092 			}
2093 			mutex_exit(&connp->conn_lock);
2094 			break;
2095 		}
2096 		break;
2097 	}
2098 	error = conn_opt_set(coa, level, name, inlen, invalp,
2099 	    checkonly, cr);
2100 	return (error);
2101 }
2102 
2103 /*
2104  * This routine sets socket options.
2105  */
2106 int
2107 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2108     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2109     void *thisdg_attrs, cred_t *cr)
2110 {
2111 	icmp_t		*icmp = connp->conn_icmp;
2112 	int		err;
2113 	conn_opt_arg_t	coas, *coa;
2114 	boolean_t	checkonly;
2115 	icmp_stack_t	*is = icmp->icmp_is;
2116 
2117 	switch (optset_context) {
2118 	case SETFN_OPTCOM_CHECKONLY:
2119 		checkonly = B_TRUE;
2120 		/*
2121 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2122 		 * inlen != 0 implies value supplied and
2123 		 * 	we have to "pretend" to set it.
2124 		 * inlen == 0 implies that there is no
2125 		 * 	value part in T_CHECK request and just validation
2126 		 * done elsewhere should be enough, we just return here.
2127 		 */
2128 		if (inlen == 0) {
2129 			*outlenp = 0;
2130 			return (0);
2131 		}
2132 		break;
2133 	case SETFN_OPTCOM_NEGOTIATE:
2134 		checkonly = B_FALSE;
2135 		break;
2136 	case SETFN_UD_NEGOTIATE:
2137 	case SETFN_CONN_NEGOTIATE:
2138 		checkonly = B_FALSE;
2139 		/*
2140 		 * Negotiating local and "association-related" options
2141 		 * through T_UNITDATA_REQ.
2142 		 *
2143 		 * Following routine can filter out ones we do not
2144 		 * want to be "set" this way.
2145 		 */
2146 		if (!icmp_opt_allow_udr_set(level, name)) {
2147 			*outlenp = 0;
2148 			return (EINVAL);
2149 		}
2150 		break;
2151 	default:
2152 		/*
2153 		 * We should never get here
2154 		 */
2155 		*outlenp = 0;
2156 		return (EINVAL);
2157 	}
2158 
2159 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2160 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2161 
2162 	if (thisdg_attrs != NULL) {
2163 		/* Options from T_UNITDATA_REQ */
2164 		coa = (conn_opt_arg_t *)thisdg_attrs;
2165 		ASSERT(coa->coa_connp == connp);
2166 		ASSERT(coa->coa_ixa != NULL);
2167 		ASSERT(coa->coa_ipp != NULL);
2168 		ASSERT(coa->coa_ancillary);
2169 	} else {
2170 		coa = &coas;
2171 		coas.coa_connp = connp;
2172 		/* Get a reference on conn_ixa to prevent concurrent mods */
2173 		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2174 		if (coas.coa_ixa == NULL) {
2175 			*outlenp = 0;
2176 			return (ENOMEM);
2177 		}
2178 		coas.coa_ipp = &connp->conn_xmit_ipp;
2179 		coas.coa_ancillary = B_FALSE;
2180 		coas.coa_changed = 0;
2181 	}
2182 
2183 	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2184 	    cr, checkonly);
2185 	if (err != 0) {
2186 errout:
2187 		if (!coa->coa_ancillary)
2188 			ixa_refrele(coa->coa_ixa);
2189 		*outlenp = 0;
2190 		return (err);
2191 	}
2192 
2193 	/*
2194 	 * Common case of OK return with outval same as inval.
2195 	 */
2196 	if (invalp != outvalp) {
2197 		/* don't trust bcopy for identical src/dst */
2198 		(void) bcopy(invalp, outvalp, inlen);
2199 	}
2200 	*outlenp = inlen;
2201 
2202 	/*
2203 	 * If this was not ancillary data, then we rebuild the headers,
2204 	 * update the IRE/NCE, and IPsec as needed.
2205 	 * Since the label depends on the destination we go through
2206 	 * ip_set_destination first.
2207 	 */
2208 	if (coa->coa_ancillary) {
2209 		return (0);
2210 	}
2211 
2212 	if (coa->coa_changed & COA_ROUTE_CHANGED) {
2213 		in6_addr_t saddr, faddr, nexthop;
2214 		in_port_t fport;
2215 
2216 		/*
2217 		 * We clear lastdst to make sure we pick up the change
2218 		 * next time sending.
2219 		 * If we are connected we re-cache the information.
2220 		 * We ignore errors to preserve BSD behavior.
2221 		 * Note that we don't redo IPsec policy lookup here
2222 		 * since the final destination (or source) didn't change.
2223 		 */
2224 		mutex_enter(&connp->conn_lock);
2225 		connp->conn_v6lastdst = ipv6_all_zeros;
2226 
2227 		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2228 		    &connp->conn_faddr_v6, &nexthop);
2229 		saddr = connp->conn_saddr_v6;
2230 		faddr = connp->conn_faddr_v6;
2231 		fport = connp->conn_fport;
2232 		mutex_exit(&connp->conn_lock);
2233 
2234 		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2235 		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2236 			(void) ip_attr_connect(connp, coa->coa_ixa,
2237 			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
2238 			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2239 		}
2240 	}
2241 
2242 	ixa_refrele(coa->coa_ixa);
2243 
2244 	if (coa->coa_changed & COA_HEADER_CHANGED) {
2245 		/*
2246 		 * Rebuild the header template if we are connected.
2247 		 * Otherwise clear conn_v6lastdst so we rebuild the header
2248 		 * in the data path.
2249 		 */
2250 		mutex_enter(&connp->conn_lock);
2251 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2252 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2253 			err = icmp_build_hdr_template(connp,
2254 			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2255 			    connp->conn_flowinfo);
2256 			if (err != 0) {
2257 				mutex_exit(&connp->conn_lock);
2258 				return (err);
2259 			}
2260 		} else {
2261 			connp->conn_v6lastdst = ipv6_all_zeros;
2262 		}
2263 		mutex_exit(&connp->conn_lock);
2264 	}
2265 	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2266 		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2267 		    connp->conn_rcvbuf);
2268 	}
2269 	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2270 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2271 	}
2272 	if (coa->coa_changed & COA_WROFF_CHANGED) {
2273 		/* Increase wroff if needed */
2274 		uint_t wroff;
2275 
2276 		mutex_enter(&connp->conn_lock);
2277 		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2278 		if (wroff > connp->conn_wroff) {
2279 			connp->conn_wroff = wroff;
2280 			mutex_exit(&connp->conn_lock);
2281 			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2282 		} else {
2283 			mutex_exit(&connp->conn_lock);
2284 		}
2285 	}
2286 	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2287 		icmp_bind_proto(icmp);
2288 	}
2289 	return (err);
2290 }
2291 
2292 /* This routine sets socket options. */
2293 int
2294 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2295     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2296     void *thisdg_attrs, cred_t *cr)
2297 {
2298 	conn_t	*connp = Q_TO_CONN(q);
2299 	int error;
2300 
2301 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2302 	    outlenp, outvalp, thisdg_attrs, cr);
2303 	return (error);
2304 }
2305 
2306 /*
2307  * Setup IP headers.
2308  *
2309  * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2310  * but icmp_output_hdrincl restores ipha_protocol once we return.
2311  */
2312 mblk_t *
2313 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2314     const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2315     mblk_t *data_mp, int *errorp)
2316 {
2317 	mblk_t		*mp;
2318 	icmp_stack_t	*is = connp->conn_netstack->netstack_icmp;
2319 	uint_t		data_len;
2320 	uint32_t	cksum;
2321 
2322 	data_len = msgdsize(data_mp);
2323 	mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2324 	    flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2325 	if (mp == NULL) {
2326 		ASSERT(*errorp != 0);
2327 		return (NULL);
2328 	}
2329 
2330 	ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2331 
2332 	/*
2333 	 * If there was a routing option/header then conn_prepend_hdr
2334 	 * has massaged it and placed the pseudo-header checksum difference
2335 	 * in the cksum argument.
2336 	 *
2337 	 * Prepare for ICMPv6 checksum done in IP.
2338 	 *
2339 	 * We make it easy for IP to include our pseudo header
2340 	 * by putting our length (and any routing header adjustment)
2341 	 * in the ICMPv6 checksum field.
2342 	 * The IP source, destination, and length have already been set by
2343 	 * conn_prepend_hdr.
2344 	 */
2345 	cksum += data_len;
2346 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
2347 	ASSERT(cksum < 0x10000);
2348 
2349 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2350 		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
2351 
2352 		ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2353 	} else {
2354 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
2355 		uint_t	cksum_offset = 0;
2356 
2357 		ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2358 
2359 		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2360 			if (connp->conn_proto == IPPROTO_ICMPV6) {
2361 				cksum_offset = ixa->ixa_ip_hdr_length +
2362 				    offsetof(icmp6_t, icmp6_cksum);
2363 			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2364 				cksum_offset = ixa->ixa_ip_hdr_length +
2365 				    ixa->ixa_raw_cksum_offset;
2366 			}
2367 		}
2368 		if (cksum_offset != 0) {
2369 			uint16_t *ptr;
2370 
2371 			/* Make sure the checksum fits in the first mblk */
2372 			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2373 				mblk_t *mp1;
2374 
2375 				mp1 = msgpullup(mp,
2376 				    cksum_offset + sizeof (short));
2377 				freemsg(mp);
2378 				if (mp1 == NULL) {
2379 					*errorp = ENOMEM;
2380 					return (NULL);
2381 				}
2382 				mp = mp1;
2383 				ip6h = (ip6_t *)mp->b_rptr;
2384 			}
2385 			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2386 			*ptr = htons(cksum);
2387 		}
2388 	}
2389 
2390 	/* Note that we don't try to update wroff due to ancillary data */
2391 	return (mp);
2392 }
2393 
2394 static int
2395 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2396     const in6_addr_t *v6dst, uint32_t flowinfo)
2397 {
2398 	int		error;
2399 
2400 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2401 	/*
2402 	 * We clear lastdst to make sure we don't use the lastdst path
2403 	 * next time sending since we might not have set v6dst yet.
2404 	 */
2405 	connp->conn_v6lastdst = ipv6_all_zeros;
2406 
2407 	error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2408 	if (error != 0)
2409 		return (error);
2410 
2411 	/*
2412 	 * Any routing header/option has been massaged. The checksum difference
2413 	 * is stored in conn_sum.
2414 	 */
2415 	return (0);
2416 }
2417 
2418 /*
2419  * This routine retrieves the value of an ND variable in a icmpparam_t
2420  * structure.  It is called through nd_getset when a user reads the
2421  * variable.
2422  */
2423 /* ARGSUSED */
2424 static int
2425 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
2426 {
2427 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
2428 
2429 	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
2430 	return (0);
2431 }
2432 
2433 /*
2434  * Walk through the param array specified registering each element with the
2435  * named dispatch (ND) handler.
2436  */
2437 static boolean_t
2438 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
2439 {
2440 	for (; cnt-- > 0; icmppa++) {
2441 		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
2442 			if (!nd_load(ndp, icmppa->icmp_param_name,
2443 			    icmp_param_get, icmp_param_set,
2444 			    (caddr_t)icmppa)) {
2445 				nd_free(ndp);
2446 				return (B_FALSE);
2447 			}
2448 		}
2449 	}
2450 	return (B_TRUE);
2451 }
2452 
2453 /* This routine sets an ND variable in a icmpparam_t structure. */
2454 /* ARGSUSED */
2455 static int
2456 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
2457 {
2458 	long		new_value;
2459 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
2460 
2461 	/*
2462 	 * Fail the request if the new value does not lie within the
2463 	 * required bounds.
2464 	 */
2465 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
2466 	    new_value < icmppa->icmp_param_min ||
2467 	    new_value > icmppa->icmp_param_max) {
2468 		return (EINVAL);
2469 	}
2470 	/* Set the new value */
2471 	icmppa->icmp_param_value = new_value;
2472 	return (0);
2473 }
2474 
2475 static mblk_t *
2476 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2477 {
2478 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2479 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2480 		/*
2481 		 * fallback has started but messages have not been moved yet
2482 		 */
2483 		if (icmp->icmp_fallback_queue_head == NULL) {
2484 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2485 			icmp->icmp_fallback_queue_head = mp;
2486 			icmp->icmp_fallback_queue_tail = mp;
2487 		} else {
2488 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2489 			icmp->icmp_fallback_queue_tail->b_next = mp;
2490 			icmp->icmp_fallback_queue_tail = mp;
2491 		}
2492 		return (NULL);
2493 	} else {
2494 		/*
2495 		 * Fallback completed, let the caller putnext() the mblk.
2496 		 */
2497 		return (mp);
2498 	}
2499 }
2500 
2501 /*
2502  * Deliver data to ULP. In case we have a socket, and it's falling back to
2503  * TPI, then we'll queue the mp for later processing.
2504  */
2505 static void
2506 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2507 {
2508 	if (IPCL_IS_NONSTR(connp)) {
2509 		icmp_t *icmp = connp->conn_icmp;
2510 		int error;
2511 
2512 		ASSERT(len == msgdsize(mp));
2513 		if ((*connp->conn_upcalls->su_recv)
2514 		    (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2515 			mutex_enter(&icmp->icmp_recv_lock);
2516 			if (error == ENOSPC) {
2517 				/*
2518 				 * let's confirm while holding the lock
2519 				 */
2520 				if ((*connp->conn_upcalls->su_recv)
2521 				    (connp->conn_upper_handle, NULL, 0, 0,
2522 				    &error, NULL) < 0) {
2523 					ASSERT(error == ENOSPC);
2524 					if (error == ENOSPC) {
2525 						connp->conn_flow_cntrld =
2526 						    B_TRUE;
2527 					}
2528 				}
2529 				mutex_exit(&icmp->icmp_recv_lock);
2530 			} else {
2531 				ASSERT(error == EOPNOTSUPP);
2532 				mp = icmp_queue_fallback(icmp, mp);
2533 				mutex_exit(&icmp->icmp_recv_lock);
2534 				if (mp != NULL)
2535 					putnext(connp->conn_rq, mp);
2536 			}
2537 		}
2538 		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2539 	} else {
2540 		putnext(connp->conn_rq, mp);
2541 	}
2542 }
2543 
/*
 * This is the inbound data path.
 * IP has already pulled up the IP headers and verified alignment
 * etc.
 *
 * arg1 is the conn_t of the receiving raw socket, mp is the packet (M_DATA,
 * starting at the IP header) and ira holds its receive attributes.  arg2 is
 * not used.
 *
 * The packet is wrapped in a T_UNITDATA_IND that carries the source address
 * (a sin_t for IPv4, a sin6_t for IPv6) followed by any ancillary data items
 * the conn has asked for, and is then passed to icmp_ulp_recv().  IPv4
 * packets are passed up with their IP header still attached; for IPv6 the
 * read pointer is advanced past the IP headers first.
 *
 * The packet is freed instead of delivered when an allocation fails
 * (rawipInErrors is bumped), when the IPv6 raw checksum does not verify
 * (rawipInCksumErrs is bumped), or when the conn's ICMPv6 filter blocks the
 * message type.
 */
/* ARGSUSED2 */
static void
icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t			*connp = (conn_t *)arg1;
	struct T_unitdata_ind	*tudi;
	uchar_t			*rptr;		/* Pointer to IP header */
	int			ip_hdr_length;
	int			udi_size;	/* Size of T_unitdata_ind */
	int			pkt_len;
	icmp_t			*icmp;
	ip_pkt_t		ipps;
	ip6_t			*ip6h;
	mblk_t			*mp1;
	crb_t			recv_ancillary;
	icmp_stack_t		*is;
	sin_t			*sin;
	sin6_t			*sin6;
	ipha_t			*ipha;

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);

	icmp = connp->conn_icmp;
	is = icmp->icmp_is;
	rptr = mp->b_rptr;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(OK_32PTR(rptr));
	ASSERT(ira->ira_pktlen == msgdsize(mp));
	pkt_len = ira->ira_pktlen;

	/*
	 * Get a snapshot of these and allow other threads to change
	 * them after that. We need the same recv_ancillary when determining
	 * the size as when adding the ancillary data items.
	 */
	mutex_enter(&connp->conn_lock);
	recv_ancillary = connp->conn_recv_ancillary;
	mutex_exit(&connp->conn_lock);

	ip_hdr_length = ira->ira_ip_hdr_length;
	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */

	/* Initialize regardless of IP version */
	ipps.ipp_fields = 0;

	if (ira->ira_flags & IRAF_IS_IPV4) {
		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));

		ipha = (ipha_t *)mp->b_rptr;
		/* The header only needs parsing if ancillary data is wanted */
		if (recv_ancillary.crb_all != 0)
			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);

		/*
		 * BSD for some reason adjusts ipha_length to exclude the
		 * IP header length. We do the same.
		 */
		if (is->is_bsd_compat) {
			ushort_t len;

			len = ntohs(ipha->ipha_length);
			if (mp->b_datap->db_ref > 1) {
				/*
				 * Allocate a new IP header so that we can
				 * modify ipha_length.
				 *
				 * Note that this mp1 shadows the
				 * function-scope mp1 for the rest of this
				 * block.
				 */
				mblk_t	*mp1;

				mp1 = allocb(ip_hdr_length, BPRI_MED);
				if (mp1 == NULL) {
					freemsg(mp);
					BUMP_MIB(&is->is_rawip_mib,
					    rawipInErrors);
					return;
				}
				/*
				 * Copy the header into the new mblk, make
				 * the old one start at the payload, and
				 * chain it behind the new header.
				 */
				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
				mp->b_rptr = rptr + ip_hdr_length;
				rptr = mp1->b_rptr;
				ipha = (ipha_t *)rptr;
				mp1->b_cont = mp;
				mp1->b_wptr = rptr + ip_hdr_length;
				mp = mp1;
			}
			len -= ip_hdr_length;
			ipha->ipha_length = htons(len);
		}

		/*
		 * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
		 * sockets. This is ensured by icmp_bind and the IP fanout code.
		 */
		ASSERT(connp->conn_family == AF_INET);

		/*
		 * This is the inbound data path.  Packets are passed upstream
		 * as T_UNITDATA_IND messages with full IPv4 headers still
		 * attached.
		 */

		/*
		 * Normally only send up the source address.
		 * If any ancillary data items are wanted we add those.
		 */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
		if (recv_ancillary.crb_all != 0) {
			udi_size += conn_recvancillary_size(connp,
			    recv_ancillary, ira, mp, &ipps);
		}

		/* Allocate a message block for the T_UNITDATA_IND structure. */
		mp1 = allocb(udi_size, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
			return;
		}
		mp1->b_cont = mp;
		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
		mp1->b_datap->db_type = M_PROTO;
		mp1->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		/* The source address immediately follows the TPI header */
		sin = (sin_t *)&tudi[1];
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipha->ipha_src;
		*(uint32_t *)&sin->sin_zero[0] = 0;
		*(uint32_t *)&sin->sin_zero[4] = 0;
		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
		    sizeof (sin_t);
		/* From here on udi_size is the size of the options only */
		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
		tudi->OPT_length = udi_size;

		/*
		 * Add options if IP_RECVIF etc is set
		 */
		if (udi_size != 0) {
			conn_recvancillary_add(connp, recv_ancillary, ira,
			    &ipps, (uchar_t *)&sin[1], udi_size);
		}
		goto deliver;
	}

	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
	/*
	 * IPv6 packets can only be received by applications
	 * that are prepared to receive IPv6 addresses.
	 * The IP fanout must ensure this.
	 */
	ASSERT(connp->conn_family == AF_INET6);

	/*
	 * Handle IPv6 packets. We don't pass up the IP headers with the
	 * payload for IPv6.
	 */

	ip6h = (ip6_t *)rptr;
	if (recv_ancillary.crb_all != 0) {
		/*
		 * Call on ip_find_hdr_v6 which gets individual lengths of
		 * extension headers (and pointers to them).
		 */
		uint8_t		nexthdr;

		/* We don't care about the length or nextheader. */
		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);

		/*
		 * We do not pass up hop-by-hop options or any other
		 * extension header as part of the packet. Applications
		 * that want to see them have to specify IPV6_RECV* socket
		 * options. And conn_recvancillary_size/add explicitly
		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
		 *
		 * If we had multilevel ICMP sockets, then we'd want to
		 * modify conn_recvancillary_size/add to
		 * allow the user to see the label.
		 */
	}

	/*
	 * Check a filter for ICMPv6 types if needed.
	 * Verify raw checksums if needed.
	 * Both are done under conn_lock.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_filter != NULL) {
		int type;

		/* Assumes that IP has done the pullupmsg */
		type = mp->b_rptr[ip_hdr_length];

		/*
		 * NOTE(review): this assertion runs after the byte has
		 * already been read, and "<=" also accepts the case where
		 * the first mblk ends exactly at the IP header, in which
		 * case the type byte lies at b_wptr.  Confirm whether IP
		 * guarantees at least one payload byte here.
		 */
		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			return;
		}
	}
	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
		/* Checksum */
		uint16_t	*up;
		uint32_t	sum;
		int		remlen;

		/*
		 * up[0..7] are the 16-bit words of the source address and
		 * up[8..15] those of the destination, which follows it in
		 * the header.
		 */
		up = (uint16_t *)&ip6h->ip6_src;

		/*
		 * Seed the sum with the pseudo header: protocol plus payload
		 * length, and both addresses.
		 */
		remlen = msgdsize(mp) - ip_hdr_length;
		sum = htons(connp->conn_proto + remlen)
		    + up[0] + up[1] + up[2] + up[3]
		    + up[4] + up[5] + up[6] + up[7]
		    + up[8] + up[9] + up[10] + up[11]
		    + up[12] + up[13] + up[14] + up[15];
		sum = (sum & 0xffff) + (sum >> 16);
		sum = IP_CSUM(mp, ip_hdr_length, sum);
		if (sum != 0) {
			/* IPv6 RAW checksum failed */
			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);

	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);

	if (recv_ancillary.crb_all != 0) {
		udi_size += conn_recvancillary_size(connp,
		    recv_ancillary, ira, mp, &ipps);
	}

	/* Allocate a message block for the T_UNITDATA_IND structure. */
	mp1 = allocb(udi_size, BPRI_MED);
	if (mp1 == NULL) {
		freemsg(mp);
		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
		return;
	}
	mp1->b_cont = mp;
	mp1->b_datap->db_type = M_PROTO;
	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
	mp1->b_wptr = (uchar_t *)tudi + udi_size;
	tudi->PRIM_type = T_UNITDATA_IND;
	tudi->SRC_length = sizeof (sin6_t);
	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
	/* From here on udi_size is the size of the options only */
	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
	tudi->OPT_length = udi_size;
	/* The source address immediately follows the TPI header */
	sin6 = (sin6_t *)&tudi[1];
	*sin6 = sin6_null;
	sin6->sin6_port = 0;
	sin6->sin6_family = AF_INET6;

	sin6->sin6_addr = ip6h->ip6_src;
	/* No sin6_flowinfo per API */
	sin6->sin6_flowinfo = 0;
	/* For link-scope pass up scope id */
	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
		sin6->sin6_scope_id = ira->ira_ruifindex;
	else
		sin6->sin6_scope_id = 0;
	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
	    IPCL_ZONEID(connp), is->is_netstack);

	if (udi_size != 0) {
		conn_recvancillary_add(connp, recv_ancillary, ira,
		    &ipps, (uchar_t *)&sin6[1], udi_size);
	}

	/* Skip all the IPv6 headers per API */
	mp->b_rptr += ip_hdr_length;
	pkt_len -= ip_hdr_length;

deliver:
	/* mp1 is the T_UNITDATA_IND with the packet chained behind it */
	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
	icmp_ulp_recv(connp, mp1, pkt_len);
}
2829 
2830 /*
2831  * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
2832  * information that can be changing beneath us.
2833  */
2834 mblk_t *
2835 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2836 {
2837 	mblk_t			*mpdata;
2838 	struct opthdr		*optp;
2839 	conn_t			*connp = Q_TO_CONN(q);
2840 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
2841 	mblk_t			*mp2ctl;
2842 
2843 	/*
2844 	 * make a copy of the original message
2845 	 */
2846 	mp2ctl = copymsg(mpctl);
2847 
2848 	if (mpctl == NULL ||
2849 	    (mpdata = mpctl->b_cont) == NULL) {
2850 		freemsg(mpctl);
2851 		freemsg(mp2ctl);
2852 		return (0);
2853 	}
2854 
2855 	/* fixed length structure for IPv4 and IPv6 counters */
2856 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2857 	optp->level = EXPER_RAWIP;
2858 	optp->name = 0;
2859 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2860 	    sizeof (is->is_rawip_mib));
2861 	optp->len = msgdsize(mpdata);
2862 	qreply(q, mpctl);
2863 
2864 	return (mp2ctl);
2865 }
2866 
2867 /*
2868  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2869  * TODO:  If this ever actually tries to set anything, it needs to be
2870  * to do the appropriate locking.
2871  */
2872 /* ARGSUSED */
2873 int
2874 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2875     uchar_t *ptr, int len)
2876 {
2877 	switch (level) {
2878 	case EXPER_RAWIP:
2879 		return (0);
2880 	default:
2881 		return (1);
2882 	}
2883 }
2884 
2885 /*
2886  * This routine creates a T_UDERROR_IND message and passes it upstream.
2887  * The address and options are copied from the T_UNITDATA_REQ message
2888  * passed in mp.  This message is freed.
2889  */
2890 static void
2891 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
2892 {
2893 	struct T_unitdata_req *tudr;
2894 	mblk_t	*mp1;
2895 	uchar_t *destaddr;
2896 	t_scalar_t destlen;
2897 	uchar_t	*optaddr;
2898 	t_scalar_t optlen;
2899 
2900 	if ((mp->b_wptr < mp->b_rptr) ||
2901 	    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
2902 		goto done;
2903 	}
2904 	tudr = (struct T_unitdata_req *)mp->b_rptr;
2905 	destaddr = mp->b_rptr + tudr->DEST_offset;
2906 	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
2907 	    destaddr + tudr->DEST_length < mp->b_rptr ||
2908 	    destaddr + tudr->DEST_length > mp->b_wptr) {
2909 		goto done;
2910 	}
2911 	optaddr = mp->b_rptr + tudr->OPT_offset;
2912 	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
2913 	    optaddr + tudr->OPT_length < mp->b_rptr ||
2914 	    optaddr + tudr->OPT_length > mp->b_wptr) {
2915 		goto done;
2916 	}
2917 	destlen = tudr->DEST_length;
2918 	optlen = tudr->OPT_length;
2919 
2920 	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
2921 	    (char *)optaddr, optlen, err);
2922 	if (mp1 != NULL)
2923 		qreply(q, mp1);
2924 
2925 done:
2926 	freemsg(mp);
2927 }
2928 
2929 static int
2930 rawip_do_unbind(conn_t *connp)
2931 {
2932 	icmp_t	*icmp = connp->conn_icmp;
2933 
2934 	mutex_enter(&connp->conn_lock);
2935 	/* If a bind has not been done, we can't unbind. */
2936 	if (icmp->icmp_state == TS_UNBND) {
2937 		mutex_exit(&connp->conn_lock);
2938 		return (-TOUTSTATE);
2939 	}
2940 	connp->conn_saddr_v6 = ipv6_all_zeros;
2941 	connp->conn_bound_addr_v6 = ipv6_all_zeros;
2942 	connp->conn_laddr_v6 = ipv6_all_zeros;
2943 	connp->conn_mcbc_bind = B_FALSE;
2944 	connp->conn_lport = 0;
2945 	connp->conn_fport = 0;
2946 	/* In case we were also connected */
2947 	connp->conn_faddr_v6 = ipv6_all_zeros;
2948 	connp->conn_v6lastdst = ipv6_all_zeros;
2949 
2950 	icmp->icmp_state = TS_UNBND;
2951 
2952 	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
2953 	    &connp->conn_faddr_v6, connp->conn_flowinfo);
2954 	mutex_exit(&connp->conn_lock);
2955 
2956 	ip_unbind(connp);
2957 	return (0);
2958 }
2959 
2960 /*
2961  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
2962  * After some error checking, the message is passed downstream to ip.
2963  */
2964 static void
2965 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
2966 {
2967 	conn_t	*connp = Q_TO_CONN(q);
2968 	int	error;
2969 
2970 	ASSERT(mp->b_cont == NULL);
2971 	error = rawip_do_unbind(connp);
2972 	if (error) {
2973 		if (error < 0) {
2974 			icmp_err_ack(q, mp, -error, 0);
2975 		} else {
2976 			icmp_err_ack(q, mp, 0, error);
2977 		}
2978 		return;
2979 	}
2980 
2981 	/*
2982 	 * Convert mp into a T_OK_ACK
2983 	 */
2984 
2985 	mp = mi_tpi_ok_ack_alloc(mp);
2986 
2987 	/*
2988 	 * should not happen in practice... T_OK_ACK is smaller than the
2989 	 * original message.
2990 	 */
2991 	ASSERT(mp != NULL);
2992 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
2993 	qreply(q, mp);
2994 }
2995 
2996 /*
2997  * Process IPv4 packets that already include an IP header.
2998  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
2999  * IPPROTO_IGMP).
3000  * In this case we ignore the address and any options in the T_UNITDATA_REQ.
3001  *
3002  * The packet is assumed to have a base (20 byte) IP header followed
3003  * by the upper-layer protocol. We include any IP_OPTIONS including a
3004  * CIPSO label but otherwise preserve the base IP header.
3005  */
3006 static int
3007 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3008 {
3009 	icmp_t		*icmp = connp->conn_icmp;
3010 	icmp_stack_t	*is = icmp->icmp_is;
3011 	ipha_t		iphas;
3012 	ipha_t		*ipha;
3013 	int		ip_hdr_length;
3014 	int		tp_hdr_len;
3015 	ip_xmit_attr_t	*ixa;
3016 	ip_pkt_t	*ipp;
3017 	in6_addr_t	v6src;
3018 	in6_addr_t	v6dst;
3019 	in6_addr_t	v6nexthop;
3020 	int		error;
3021 	boolean_t	do_ipsec;
3022 
3023 	/*
3024 	 * We need an exclusive copy of conn_ixa since the included IP
3025 	 * header could have any destination.
3026 	 * That copy has no pointers hence we
3027 	 * need to set them up once we've parsed the ancillary data.
3028 	 */
3029 	ixa = conn_get_ixa_exclusive(connp);
3030 	if (ixa == NULL) {
3031 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3032 		freemsg(mp);
3033 		return (ENOMEM);
3034 	}
3035 	ASSERT(cr != NULL);
3036 	/*
3037 	 * Caller has a reference on cr; from db_credp or because we
3038 	 * are running in process context.
3039 	 */
3040 	ixa->ixa_cred = cr;
3041 	ixa->ixa_cpid = pid;
3042 	if (is_system_labeled()) {
3043 		/* We need to restart with a label based on the cred */
3044 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3045 	}
3046 
3047 	/* In case previous destination was multicast or multirt */
3048 	ip_attr_newdst(ixa);
3049 
3050 	/* Get a copy of conn_xmit_ipp since the TX label might change it */
3051 	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3052 	if (ipp == NULL) {
3053 		ixa_refrele(ixa);
3054 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3055 		freemsg(mp);
3056 		return (ENOMEM);
3057 	}
3058 	mutex_enter(&connp->conn_lock);
3059 	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3060 	mutex_exit(&connp->conn_lock);
3061 	if (error != 0) {
3062 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3063 		freemsg(mp);
3064 		goto done;
3065 	}
3066 
3067 	/* Sanity check length of packet */
3068 	ipha = (ipha_t *)mp->b_rptr;
3069 
3070 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3071 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3072 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3073 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3074 			freemsg(mp);
3075 			goto done;
3076 		}
3077 		ipha = (ipha_t *)mp->b_rptr;
3078 	}
3079 	ipha->ipha_version_and_hdr_length =
3080 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
3081 
3082 	/*
3083 	 * We set IXAF_DONTFRAG if the application set DF which makes
3084 	 * IP not fragment.
3085 	 */
3086 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3087 	if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3088 		ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3089 	else
3090 		ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3091 
3092 	/* Even for multicast and broadcast we honor the apps ttl */
3093 	ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3094 
3095 	if (ipha->ipha_dst == INADDR_ANY)
3096 		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3097 
3098 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3099 	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3100 
3101 	/* Defer IPsec if it might need to look at ICMP type/code */
3102 	do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3103 	ixa->ixa_flags |= IXAF_IS_IPV4;
3104 
3105 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3106 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3107 	    connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3108 	    (do_ipsec ? IPDF_IPSEC : 0));
3109 	switch (error) {
3110 	case 0:
3111 		break;
3112 	case EADDRNOTAVAIL:
3113 		/*
3114 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3115 		 * Don't have the application see that errno
3116 		 */
3117 		error = ENETUNREACH;
3118 		goto failed;
3119 	case ENETDOWN:
3120 		/*
3121 		 * Have !ipif_addr_ready address; drop packet silently
3122 		 * until we can get applications to not send until we
3123 		 * are ready.
3124 		 */
3125 		error = 0;
3126 		goto failed;
3127 	case EHOSTUNREACH:
3128 	case ENETUNREACH:
3129 		if (ixa->ixa_ire != NULL) {
3130 			/*
3131 			 * Let conn_ip_output/ire_send_noroute return
3132 			 * the error and send any local ICMP error.
3133 			 */
3134 			error = 0;
3135 			break;
3136 		}
3137 		/* FALLTHRU */
3138 	default:
3139 	failed:
3140 		freemsg(mp);
3141 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3142 		goto done;
3143 	}
3144 	if (ipha->ipha_src == INADDR_ANY)
3145 		IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3146 
3147 	/*
3148 	 * We might be going to a different destination than last time,
3149 	 * thus check that TX allows the communication and compute any
3150 	 * needed label.
3151 	 *
3152 	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3153 	 * don't have to worry about concurrent threads.
3154 	 */
3155 	if (is_system_labeled()) {
3156 		/*
3157 		 * Check whether Trusted Solaris policy allows communication
3158 		 * with this host, and pretend that the destination is
3159 		 * unreachable if not.
3160 		 * Compute any needed label and place it in ipp_label_v4/v6.
3161 		 *
3162 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3163 		 * ipp_label_v4/v6 to form the packet.
3164 		 *
3165 		 * Tsol note: We have ipp structure local to this thread so
3166 		 * no locking is needed.
3167 		 */
3168 		error = conn_update_label(connp, ixa, &v6dst, ipp);
3169 		if (error != 0) {
3170 			freemsg(mp);
3171 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3172 			goto done;
3173 		}
3174 	}
3175 
3176 	/*
3177 	 * Save away a copy of the IPv4 header the application passed down
3178 	 * and then prepend an IPv4 header complete with any IP options
3179 	 * including label.
3180 	 * We need a struct copy since icmp_prepend_hdr will reuse the available
3181 	 * space in the mblk.
3182 	 */
3183 	iphas = *ipha;
3184 	mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3185 
3186 	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3187 	if (mp == NULL) {
3188 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3189 		ASSERT(error != 0);
3190 		goto done;
3191 	}
3192 	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3193 		error = EMSGSIZE;
3194 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3195 		freemsg(mp);
3196 		goto done;
3197 	}
3198 	/* Restore key parts of the header that the application passed down */
3199 	ipha = (ipha_t *)mp->b_rptr;
3200 	ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3201 	ipha->ipha_ident = iphas.ipha_ident;
3202 	ipha->ipha_fragment_offset_and_flags =
3203 	    iphas.ipha_fragment_offset_and_flags;
3204 	ipha->ipha_ttl = iphas.ipha_ttl;
3205 	ipha->ipha_protocol = iphas.ipha_protocol;
3206 	ipha->ipha_src = iphas.ipha_src;
3207 	ipha->ipha_dst = iphas.ipha_dst;
3208 
3209 	ixa->ixa_protocol = ipha->ipha_protocol;
3210 
3211 	/*
3212 	 * Make sure that the IP header plus any transport header that is
3213 	 * checksumed by ip_output is in the first mblk. (ip_output assumes
3214 	 * that at least the checksum field is in the first mblk.)
3215 	 */
3216 	switch (ipha->ipha_protocol) {
3217 	case IPPROTO_UDP:
3218 		tp_hdr_len = 8;
3219 		break;
3220 	case IPPROTO_TCP:
3221 		tp_hdr_len = 20;
3222 		break;
3223 	default:
3224 		tp_hdr_len = 0;
3225 		break;
3226 	}
3227 	ip_hdr_length = IPH_HDR_LENGTH(ipha);
3228 	if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3229 		if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3230 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3231 			if (mp->b_cont == NULL)
3232 				error = EINVAL;
3233 			else
3234 				error = ENOMEM;
3235 			freemsg(mp);
3236 			goto done;
3237 		}
3238 	}
3239 
3240 	if (!do_ipsec) {
3241 		/* Policy might differ for different ICMP type/code */
3242 		if (ixa->ixa_ipsec_policy != NULL) {
3243 			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3244 			ixa->ixa_ipsec_policy = NULL;
3245 			ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3246 		}
3247 		mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3248 		if (mp == NULL) {
3249 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3250 			error = EHOSTUNREACH;	/* IPsec policy failure */
3251 			goto done;
3252 		}
3253 	}
3254 
3255 	/* We're done.  Pass the packet to ip. */
3256 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3257 
3258 	error = conn_ip_output(mp, ixa);
3259 	/* No rawipOutErrors if an error since IP increases its error counter */
3260 	switch (error) {
3261 	case 0:
3262 		break;
3263 	case EWOULDBLOCK:
3264 		(void) ixa_check_drain_insert(connp, ixa);
3265 		error = 0;
3266 		break;
3267 	case EADDRNOTAVAIL:
3268 		/*
3269 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3270 		 * Don't have the application see that errno
3271 		 */
3272 		error = ENETUNREACH;
3273 		break;
3274 	}
3275 done:
3276 	ixa_refrele(ixa);
3277 	ip_pkt_free(ipp);
3278 	kmem_free(ipp, sizeof (*ipp));
3279 	return (error);
3280 }
3281 
3282 static mblk_t *
3283 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3284 {
3285 	ipha_t	*ipha = NULL;
3286 	ip6_t	*ip6h = NULL;
3287 
3288 	if (ixa->ixa_flags & IXAF_IS_IPV4)
3289 		ipha = (ipha_t *)mp->b_rptr;
3290 	else
3291 		ip6h = (ip6_t *)mp->b_rptr;
3292 
3293 	if (ixa->ixa_ipsec_policy != NULL) {
3294 		IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3295 		ixa->ixa_ipsec_policy = NULL;
3296 		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3297 	}
3298 	return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3299 }
3300 
3301 /*
3302  * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3303  * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3304  * the TPI options, otherwise we take them from msg_control.
3305  * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3306  * Always consumes mp; never consumes tudr_mp.
3307  */
3308 static int
3309 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3310     mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3311 {
3312 	icmp_t		*icmp = connp->conn_icmp;
3313 	icmp_stack_t	*is = icmp->icmp_is;
3314 	int		error;
3315 	ip_xmit_attr_t	*ixa;
3316 	ip_pkt_t	*ipp;
3317 	in6_addr_t	v6src;
3318 	in6_addr_t	v6dst;
3319 	in6_addr_t	v6nexthop;
3320 	in_port_t	dstport;
3321 	uint32_t	flowinfo;
3322 	uint_t		srcid;
3323 	int		is_absreq_failure = 0;
3324 	conn_opt_arg_t	coas, *coa;
3325 
3326 	ASSERT(tudr_mp != NULL || msg != NULL);
3327 
3328 	/*
3329 	 * Get ixa before checking state to handle a disconnect race.
3330 	 *
3331 	 * We need an exclusive copy of conn_ixa since the ancillary data
3332 	 * options might modify it. That copy has no pointers hence we
3333 	 * need to set them up once we've parsed the ancillary data.
3334 	 */
3335 	ixa = conn_get_ixa_exclusive(connp);
3336 	if (ixa == NULL) {
3337 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3338 		freemsg(mp);
3339 		return (ENOMEM);
3340 	}
3341 	ASSERT(cr != NULL);
3342 	ixa->ixa_cred = cr;
3343 	ixa->ixa_cpid = pid;
3344 	if (is_system_labeled()) {
3345 		/* We need to restart with a label based on the cred */
3346 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3347 	}
3348 
3349 	/* In case previous destination was multicast or multirt */
3350 	ip_attr_newdst(ixa);
3351 
3352 	/* Get a copy of conn_xmit_ipp since the options might change it */
3353 	ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3354 	if (ipp == NULL) {
3355 		ixa_refrele(ixa);
3356 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3357 		freemsg(mp);
3358 		return (ENOMEM);
3359 	}
3360 	mutex_enter(&connp->conn_lock);
3361 	error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3362 	mutex_exit(&connp->conn_lock);
3363 	if (error != 0) {
3364 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3365 		freemsg(mp);
3366 		goto done;
3367 	}
3368 
3369 	/*
3370 	 * Parse the options and update ixa and ipp as a result.
3371 	 */
3372 
3373 	coa = &coas;
3374 	coa->coa_connp = connp;
3375 	coa->coa_ixa = ixa;
3376 	coa->coa_ipp = ipp;
3377 	coa->coa_ancillary = B_TRUE;
3378 	coa->coa_changed = 0;
3379 
3380 	if (msg != NULL) {
3381 		error = process_auxiliary_options(connp, msg->msg_control,
3382 		    msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3383 	} else {
3384 		struct T_unitdata_req *tudr;
3385 
3386 		tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3387 		ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3388 		error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3389 		    &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3390 		    coa, &is_absreq_failure);
3391 	}
3392 	if (error != 0) {
3393 		/*
3394 		 * Note: No special action needed in this
3395 		 * module for "is_absreq_failure"
3396 		 */
3397 		freemsg(mp);
3398 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3399 		goto done;
3400 	}
3401 	ASSERT(is_absreq_failure == 0);
3402 
3403 	mutex_enter(&connp->conn_lock);
3404 	/*
3405 	 * If laddr is unspecified then we look at sin6_src_id.
3406 	 * We will give precedence to a source address set with IPV6_PKTINFO
3407 	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3408 	 * want ip_attr_connect to select a source (since it can fail) when
3409 	 * IPV6_PKTINFO is specified.
3410 	 * If this doesn't result in a source address then we get a source
3411 	 * from ip_attr_connect() below.
3412 	 */
3413 	v6src = connp->conn_saddr_v6;
3414 	if (sin != NULL) {
3415 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3416 		dstport = sin->sin_port;
3417 		flowinfo = 0;
3418 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3419 		ixa->ixa_flags |= IXAF_IS_IPV4;
3420 	} else if (sin6 != NULL) {
3421 		v6dst = sin6->sin6_addr;
3422 		dstport = sin6->sin6_port;
3423 		flowinfo = sin6->sin6_flowinfo;
3424 		srcid = sin6->__sin6_src_id;
3425 		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3426 			ixa->ixa_scopeid = sin6->sin6_scope_id;
3427 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
3428 		} else {
3429 			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3430 		}
3431 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3432 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3433 			    connp->conn_netstack);
3434 		}
3435 		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
3436 			ixa->ixa_flags |= IXAF_IS_IPV4;
3437 		else
3438 			ixa->ixa_flags &= ~IXAF_IS_IPV4;
3439 	} else {
3440 		/* Connected case */
3441 		v6dst = connp->conn_faddr_v6;
3442 		flowinfo = connp->conn_flowinfo;
3443 	}
3444 	mutex_exit(&connp->conn_lock);
3445 	/* Handle IPV6_PKTINFO setting source address. */
3446 	if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
3447 	    (ipp->ipp_fields & IPPF_ADDR)) {
3448 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
3449 			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3450 				v6src = ipp->ipp_addr;
3451 		} else {
3452 			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3453 				v6src = ipp->ipp_addr;
3454 		}
3455 	}
3456 
3457 	ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3458 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3459 	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3460 
3461 	switch (error) {
3462 	case 0:
3463 		break;
3464 	case EADDRNOTAVAIL:
3465 		/*
3466 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3467 		 * Don't have the application see that errno
3468 		 */
3469 		error = ENETUNREACH;
3470 		goto failed;
3471 	case ENETDOWN:
3472 		/*
3473 		 * Have !ipif_addr_ready address; drop packet silently
3474 		 * until we can get applications to not send until we
3475 		 * are ready.
3476 		 */
3477 		error = 0;
3478 		goto failed;
3479 	case EHOSTUNREACH:
3480 	case ENETUNREACH:
3481 		if (ixa->ixa_ire != NULL) {
3482 			/*
3483 			 * Let conn_ip_output/ire_send_noroute return
3484 			 * the error and send any local ICMP error.
3485 			 */
3486 			error = 0;
3487 			break;
3488 		}
3489 		/* FALLTHRU */
3490 	default:
3491 	failed:
3492 		freemsg(mp);
3493 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3494 		goto done;
3495 	}
3496 
3497 	/*
3498 	 * We might be going to a different destination than last time,
3499 	 * thus check that TX allows the communication and compute any
3500 	 * needed label.
3501 	 *
3502 	 * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3503 	 * don't have to worry about concurrent threads.
3504 	 */
3505 	if (is_system_labeled()) {
3506 		/*
3507 		 * Check whether Trusted Solaris policy allows communication
3508 		 * with this host, and pretend that the destination is
3509 		 * unreachable if not.
3510 		 * Compute any needed label and place it in ipp_label_v4/v6.
3511 		 *
3512 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
3513 		 * ipp_label_v4/v6 to form the packet.
3514 		 *
3515 		 * Tsol note: We have ipp structure local to this thread so
3516 		 * no locking is needed.
3517 		 */
3518 		error = conn_update_label(connp, ixa, &v6dst, ipp);
3519 		if (error != 0) {
3520 			freemsg(mp);
3521 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3522 			goto done;
3523 		}
3524 	}
3525 	mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3526 	    &error);
3527 	if (mp == NULL) {
3528 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3529 		ASSERT(error != 0);
3530 		goto done;
3531 	}
3532 	if (ixa->ixa_pktlen > IP_MAXPACKET) {
3533 		error = EMSGSIZE;
3534 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3535 		freemsg(mp);
3536 		goto done;
3537 	}
3538 
3539 	/* Policy might differ for different ICMP type/code */
3540 	mp = icmp_output_attach_policy(mp, connp, ixa);
3541 	if (mp == NULL) {
3542 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3543 		error = EHOSTUNREACH;	/* IPsec policy failure */
3544 		goto done;
3545 	}
3546 
3547 	/* We're done.  Pass the packet to ip. */
3548 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3549 
3550 	/* Allow source not assigned to the system? */
3551 	ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3552 	error = conn_ip_output(mp, ixa);
3553 	if (!connp->conn_unspec_src)
3554 		ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3555 	/* No rawipOutErrors if an error since IP increases its error counter */
3556 	switch (error) {
3557 	case 0:
3558 		break;
3559 	case EWOULDBLOCK:
3560 		(void) ixa_check_drain_insert(connp, ixa);
3561 		error = 0;
3562 		break;
3563 	case EADDRNOTAVAIL:
3564 		/*
3565 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3566 		 * Don't have the application see that errno
3567 		 */
3568 		error = ENETUNREACH;
3569 		/* FALLTHRU */
3570 	default:
3571 		mutex_enter(&connp->conn_lock);
3572 		/*
3573 		 * Clear the source and v6lastdst so we call ip_attr_connect
3574 		 * for the next packet and try to pick a better source.
3575 		 */
3576 		if (connp->conn_mcbc_bind)
3577 			connp->conn_saddr_v6 = ipv6_all_zeros;
3578 		else
3579 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3580 		connp->conn_v6lastdst = ipv6_all_zeros;
3581 		mutex_exit(&connp->conn_lock);
3582 		break;
3583 	}
3584 done:
3585 	ixa_refrele(ixa);
3586 	ip_pkt_free(ipp);
3587 	kmem_free(ipp, sizeof (*ipp));
3588 	return (error);
3589 }
3590 
3591 /*
3592  * Handle sending an M_DATA for a connected socket.
3593  * Handles both IPv4 and IPv6.
3594  */
3595 int
3596 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3597 {
3598 	icmp_t		*icmp = connp->conn_icmp;
3599 	icmp_stack_t	*is = icmp->icmp_is;
3600 	int		error;
3601 	ip_xmit_attr_t	*ixa;
3602 	boolean_t	do_ipsec;
3603 
3604 	/*
3605 	 * If no other thread is using conn_ixa this just gets a reference to
3606 	 * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3607 	 */
3608 	ixa = conn_get_ixa(connp, B_FALSE);
3609 	if (ixa == NULL) {
3610 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3611 		freemsg(mp);
3612 		return (ENOMEM);
3613 	}
3614 
3615 	ASSERT(cr != NULL);
3616 	ixa->ixa_cred = cr;
3617 	ixa->ixa_cpid = pid;
3618 
3619 	/* Defer IPsec if it might need to look at ICMP type/code */
3620 	switch (ixa->ixa_protocol) {
3621 	case IPPROTO_ICMP:
3622 	case IPPROTO_ICMPV6:
3623 		do_ipsec = B_FALSE;
3624 		break;
3625 	default:
3626 		do_ipsec = B_TRUE;
3627 	}
3628 
3629 	mutex_enter(&connp->conn_lock);
3630 	mp = icmp_prepend_header_template(connp, ixa, mp,
3631 	    &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3632 
3633 	if (mp == NULL) {
3634 		ASSERT(error != 0);
3635 		mutex_exit(&connp->conn_lock);
3636 		ixa_refrele(ixa);
3637 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3638 		freemsg(mp);
3639 		return (error);
3640 	}
3641 
3642 	if (!do_ipsec) {
3643 		/* Policy might differ for different ICMP type/code */
3644 		mp = icmp_output_attach_policy(mp, connp, ixa);
3645 		if (mp == NULL) {
3646 			mutex_exit(&connp->conn_lock);
3647 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3648 			ixa_refrele(ixa);
3649 			return (EHOSTUNREACH);	/* IPsec policy failure */
3650 		}
3651 	}
3652 
3653 	/*
3654 	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3655 	 * safe copy, then we need to fill in any pointers in it.
3656 	 */
3657 	if (ixa->ixa_ire == NULL) {
3658 		in6_addr_t	faddr, saddr;
3659 		in6_addr_t	nexthop;
3660 		in_port_t	fport;
3661 
3662 		saddr = connp->conn_saddr_v6;
3663 		faddr = connp->conn_faddr_v6;
3664 		fport = connp->conn_fport;
3665 		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3666 		mutex_exit(&connp->conn_lock);
3667 
3668 		error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3669 		    fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3670 		    (do_ipsec ? IPDF_IPSEC : 0));
3671 		switch (error) {
3672 		case 0:
3673 			break;
3674 		case EADDRNOTAVAIL:
3675 			/*
3676 			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3677 			 * Don't have the application see that errno
3678 			 */
3679 			error = ENETUNREACH;
3680 			goto failed;
3681 		case ENETDOWN:
3682 			/*
3683 			 * Have !ipif_addr_ready address; drop packet silently
3684 			 * until we can get applications to not send until we
3685 			 * are ready.
3686 			 */
3687 			error = 0;
3688 			goto failed;
3689 		case EHOSTUNREACH:
3690 		case ENETUNREACH:
3691 			if (ixa->ixa_ire != NULL) {
3692 				/*
3693 				 * Let conn_ip_output/ire_send_noroute return
3694 				 * the error and send any local ICMP error.
3695 				 */
3696 				error = 0;
3697 				break;
3698 			}
3699 			/* FALLTHRU */
3700 		default:
3701 		failed:
3702 			ixa_refrele(ixa);
3703 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3704 			freemsg(mp);
3705 			return (error);
3706 		}
3707 	} else {
3708 		/* Done with conn_t */
3709 		mutex_exit(&connp->conn_lock);
3710 	}
3711 
3712 	/* We're done.  Pass the packet to ip. */
3713 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3714 
3715 	error = conn_ip_output(mp, ixa);
3716 	/* No rawipOutErrors if an error since IP increases its error counter */
3717 	switch (error) {
3718 	case 0:
3719 		break;
3720 	case EWOULDBLOCK:
3721 		(void) ixa_check_drain_insert(connp, ixa);
3722 		error = 0;
3723 		break;
3724 	case EADDRNOTAVAIL:
3725 		/*
3726 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3727 		 * Don't have the application see that errno
3728 		 */
3729 		error = ENETUNREACH;
3730 		break;
3731 	}
3732 	ixa_refrele(ixa);
3733 	return (error);
3734 }
3735 
3736 /*
3737  * Handle sending an M_DATA to the last destination.
3738  * Handles both IPv4 and IPv6.
3739  *
3740  * NOTE: The caller must hold conn_lock and we drop it here.
3741  */
3742 int
3743 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3744     ip_xmit_attr_t *ixa)
3745 {
3746 	icmp_t		*icmp = connp->conn_icmp;
3747 	icmp_stack_t	*is = icmp->icmp_is;
3748 	int		error;
3749 	boolean_t	do_ipsec;
3750 
3751 	ASSERT(MUTEX_HELD(&connp->conn_lock));
3752 	ASSERT(ixa != NULL);
3753 
3754 	ASSERT(cr != NULL);
3755 	ixa->ixa_cred = cr;
3756 	ixa->ixa_cpid = pid;
3757 
3758 	/* Defer IPsec if it might need to look at ICMP type/code */
3759 	switch (ixa->ixa_protocol) {
3760 	case IPPROTO_ICMP:
3761 	case IPPROTO_ICMPV6:
3762 		do_ipsec = B_FALSE;
3763 		break;
3764 	default:
3765 		do_ipsec = B_TRUE;
3766 	}
3767 
3768 
3769 	mp = icmp_prepend_header_template(connp, ixa, mp,
3770 	    &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3771 
3772 	if (mp == NULL) {
3773 		ASSERT(error != 0);
3774 		mutex_exit(&connp->conn_lock);
3775 		ixa_refrele(ixa);
3776 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3777 		freemsg(mp);
3778 		return (error);
3779 	}
3780 
3781 	if (!do_ipsec) {
3782 		/* Policy might differ for different ICMP type/code */
3783 		mp = icmp_output_attach_policy(mp, connp, ixa);
3784 		if (mp == NULL) {
3785 			mutex_exit(&connp->conn_lock);
3786 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3787 			ixa_refrele(ixa);
3788 			return (EHOSTUNREACH);	/* IPsec policy failure */
3789 		}
3790 	}
3791 
3792 	/*
3793 	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3794 	 * safe copy, then we need to fill in any pointers in it.
3795 	 */
3796 	if (ixa->ixa_ire == NULL) {
3797 		in6_addr_t	lastdst, lastsrc;
3798 		in6_addr_t	nexthop;
3799 		in_port_t	lastport;
3800 
3801 		lastsrc = connp->conn_v6lastsrc;
3802 		lastdst = connp->conn_v6lastdst;
3803 		lastport = connp->conn_lastdstport;
3804 		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3805 		mutex_exit(&connp->conn_lock);
3806 
3807 		error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
3808 		    &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
3809 		    IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
3810 		switch (error) {
3811 		case 0:
3812 			break;
3813 		case EADDRNOTAVAIL:
3814 			/*
3815 			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3816 			 * Don't have the application see that errno
3817 			 */
3818 			error = ENETUNREACH;
3819 			goto failed;
3820 		case ENETDOWN:
3821 			/*
3822 			 * Have !ipif_addr_ready address; drop packet silently
3823 			 * until we can get applications to not send until we
3824 			 * are ready.
3825 			 */
3826 			error = 0;
3827 			goto failed;
3828 		case EHOSTUNREACH:
3829 		case ENETUNREACH:
3830 			if (ixa->ixa_ire != NULL) {
3831 				/*
3832 				 * Let conn_ip_output/ire_send_noroute return
3833 				 * the error and send any local ICMP error.
3834 				 */
3835 				error = 0;
3836 				break;
3837 			}
3838 			/* FALLTHRU */
3839 		default:
3840 		failed:
3841 			ixa_refrele(ixa);
3842 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3843 			freemsg(mp);
3844 			return (error);
3845 		}
3846 	} else {
3847 		/* Done with conn_t */
3848 		mutex_exit(&connp->conn_lock);
3849 	}
3850 
3851 	/* We're done.  Pass the packet to ip. */
3852 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3853 	error = conn_ip_output(mp, ixa);
3854 	/* No rawipOutErrors if an error since IP increases its error counter */
3855 	switch (error) {
3856 	case 0:
3857 		break;
3858 	case EWOULDBLOCK:
3859 		(void) ixa_check_drain_insert(connp, ixa);
3860 		error = 0;
3861 		break;
3862 	case EADDRNOTAVAIL:
3863 		/*
3864 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
3865 		 * Don't have the application see that errno
3866 		 */
3867 		error = ENETUNREACH;
3868 		/* FALLTHRU */
3869 	default:
3870 		mutex_enter(&connp->conn_lock);
3871 		/*
3872 		 * Clear the source and v6lastdst so we call ip_attr_connect
3873 		 * for the next packet and try to pick a better source.
3874 		 */
3875 		if (connp->conn_mcbc_bind)
3876 			connp->conn_saddr_v6 = ipv6_all_zeros;
3877 		else
3878 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3879 		connp->conn_v6lastdst = ipv6_all_zeros;
3880 		mutex_exit(&connp->conn_lock);
3881 		break;
3882 	}
3883 	ixa_refrele(ixa);
3884 	return (error);
3885 }
3886 
3887 
3888 /*
3889  * Prepend the header template and then fill in the source and
3890  * flowinfo. The caller needs to handle the destination address since
3891  * it's setting is different if rthdr or source route.
3892  *
3893  * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
3894  * When it returns NULL it sets errorp.
3895  */
3896 static mblk_t *
3897 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
3898     const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
3899 {
3900 	icmp_t		*icmp = connp->conn_icmp;
3901 	icmp_stack_t	*is = icmp->icmp_is;
3902 	uint_t		pktlen;
3903 	uint_t		copylen;
3904 	uint8_t		*iph;
3905 	uint_t		ip_hdr_length;
3906 	uint32_t	cksum;
3907 	ip_pkt_t	*ipp;
3908 
3909 	ASSERT(MUTEX_HELD(&connp->conn_lock));
3910 
3911 	/*
3912 	 * Copy the header template.
3913 	 */
3914 	copylen = connp->conn_ht_iphc_len;
3915 	pktlen = copylen + msgdsize(mp);
3916 	if (pktlen > IP_MAXPACKET) {
3917 		freemsg(mp);
3918 		*errorp = EMSGSIZE;
3919 		return (NULL);
3920 	}
3921 	ixa->ixa_pktlen = pktlen;
3922 
3923 	/* check/fix buffer config, setup pointers into it */
3924 	iph = mp->b_rptr - copylen;
3925 	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
3926 		mblk_t *mp1;
3927 
3928 		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
3929 		if (mp1 == NULL) {
3930 			freemsg(mp);
3931 			*errorp = ENOMEM;
3932 			return (NULL);
3933 		}
3934 		mp1->b_wptr = DB_LIM(mp1);
3935 		mp1->b_cont = mp;
3936 		mp = mp1;
3937 		iph = (mp->b_wptr - copylen);
3938 	}
3939 	mp->b_rptr = iph;
3940 	bcopy(connp->conn_ht_iphc, iph, copylen);
3941 	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
3942 
3943 	ixa->ixa_ip_hdr_length = ip_hdr_length;
3944 
3945 	/*
3946 	 * Prepare for ICMPv6 checksum done in IP.
3947 	 *
3948 	 * icmp_build_hdr_template has already massaged any routing header
3949 	 * and placed the result in conn_sum.
3950 	 *
3951 	 * We make it easy for IP to include our pseudo header
3952 	 * by putting our length (and any routing header adjustment)
3953 	 * in the ICMPv6 checksum field.
3954 	 */
3955 	cksum = pktlen - ip_hdr_length;
3956 
3957 	cksum += connp->conn_sum;
3958 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
3959 	ASSERT(cksum < 0x10000);
3960 
3961 	ipp = &connp->conn_xmit_ipp;
3962 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
3963 		ipha_t	*ipha = (ipha_t *)iph;
3964 
3965 		ipha->ipha_length = htons((uint16_t)pktlen);
3966 
3967 		/* if IP_PKTINFO specified an addres it wins over bind() */
3968 		if ((ipp->ipp_fields & IPPF_ADDR) &&
3969 		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
3970 			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
3971 			ipha->ipha_src = ipp->ipp_addr_v4;
3972 		} else {
3973 			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
3974 		}
3975 	} else {
3976 		ip6_t *ip6h = (ip6_t *)iph;
3977 		uint_t	cksum_offset = 0;
3978 
3979 		ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
3980 
3981 		/* if IP_PKTINFO specified an addres it wins over bind() */
3982 		if ((ipp->ipp_fields & IPPF_ADDR) &&
3983 		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
3984 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
3985 			ip6h->ip6_src = ipp->ipp_addr;
3986 		} else {
3987 			ip6h->ip6_src = *v6src;
3988 		}
3989 		ip6h->ip6_vcf =
3990 		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
3991 		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
3992 		if (ipp->ipp_fields & IPPF_TCLASS) {
3993 			/* Overrides the class part of flowinfo */
3994 			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
3995 			    ipp->ipp_tclass);
3996 		}
3997 
3998 		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
3999 			if (connp->conn_proto == IPPROTO_ICMPV6) {
4000 				cksum_offset = ixa->ixa_ip_hdr_length +
4001 				    offsetof(icmp6_t, icmp6_cksum);
4002 			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4003 				cksum_offset = ixa->ixa_ip_hdr_length +
4004 				    ixa->ixa_raw_cksum_offset;
4005 			}
4006 		}
4007 		if (cksum_offset != 0) {
4008 			uint16_t *ptr;
4009 
4010 			/* Make sure the checksum fits in the first mblk */
4011 			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4012 				mblk_t *mp1;
4013 
4014 				mp1 = msgpullup(mp,
4015 				    cksum_offset + sizeof (short));
4016 				freemsg(mp);
4017 				if (mp1 == NULL) {
4018 					*errorp = ENOMEM;
4019 					return (NULL);
4020 				}
4021 				mp = mp1;
4022 				iph = mp->b_rptr;
4023 				ip6h = (ip6_t *)iph;
4024 			}
4025 			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4026 			*ptr = htons(cksum);
4027 		}
4028 	}
4029 
4030 	return (mp);
4031 }
4032 
4033 /*
4034  * This routine handles all messages passed downstream.  It either
4035  * consumes the message or passes it downstream; it never queues a
4036  * message.
 *
 * Dispatch on the message type:
 *  - M_DATA is counted in rawipOutErrors and freed.
 *  - M_PROTO/M_PCPROTO blocks that are at least as long as a
 *    struct T_unitdata_req and whose primitive type is T_UNITDATA_REQ are
 *    processed here.
 *  - Everything else is handed to icmp_wput_other().
 *
 * For a T_UNITDATA_REQ the payload is taken from mp->b_cont and the
 * destination address from the DEST_offset/DEST_length fields.  The request
 * fails with EPROTO if there is no payload or the endpoint is still
 * TS_UNBND, with EINVAL if the message carries no credentials, and with
 * EADDRNOTAVAIL if the address does not pass the checks below.
 *
 * All failures are reported through icmp_ud_err(), which frees mp.
4037  */
4038 void
4039 icmp_wput(queue_t *q, mblk_t *mp)
4040 {
4041 	sin6_t		*sin6;
4042 	sin_t		*sin = NULL;
4043 	uint_t		srcid;
4044 	conn_t		*connp = Q_TO_CONN(q);
4045 	icmp_t		*icmp = connp->conn_icmp;
4046 	int		error = 0;
4047 	struct sockaddr	*addr = NULL;
4048 	socklen_t	addrlen;
4049 	icmp_stack_t	*is = icmp->icmp_is;
4050 	struct T_unitdata_req *tudr;
4051 	mblk_t		*data_mp;
4052 	cred_t		*cr;
4053 	pid_t		pid;
4054 
4055 	/*
4056 	 * We directly handle several cases here: T_UNITDATA_REQ message
4057 	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
4058 	 * socket.
4059 	 */
4060 	switch (DB_TYPE(mp)) {
4061 	case M_DATA:
4062 		/* sockfs never sends down M_DATA */
4063 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4064 		freemsg(mp);
4065 		return;
4066 
4067 	case M_PROTO:
4068 	case M_PCPROTO:
4069 		tudr = (struct T_unitdata_req *)mp->b_rptr;
		/* Too short for, or not, a T_UNITDATA_REQ: not handled here */
4070 		if (MBLKL(mp) < sizeof (*tudr) ||
4071 		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
4072 			icmp_wput_other(q, mp);
4073 			return;
4074 		}
4075 		break;
4076 
4077 	default:
4078 		icmp_wput_other(q, mp);
4079 		return;
4080 	}
4081 
4082 	/* Handle valid T_UNITDATA_REQ here */
4083 	data_mp = mp->b_cont;
4084 	if (data_mp == NULL) {
4085 		error = EPROTO;
4086 		goto ud_error2;
4087 	}
	/*
	 * Detach the payload from the T_UNITDATA_REQ block; from here on mp
	 * (the TPI header) and data_mp are disposed of separately.
	 */
4088 	mp->b_cont = NULL;
4089 
4090 	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
4091 		error = EADDRNOTAVAIL;
4092 		goto ud_error2;
4093 	}
4094 
4095 	/*
4096 	 * All Solaris components should pass a db_credp
4097 	 * for this message, hence we ASSERT.
4098 	 * On production kernels we return an error to be robust against
4099 	 * random streams modules sitting on top of us.
4100 	 */
4101 	cr = msg_getcred(mp, &pid);
4102 	ASSERT(cr != NULL);
4103 	if (cr == NULL) {
4104 		error = EINVAL;
4105 		goto ud_error2;
4106 	}
4107 
4108 	/*
4109 	 * If a port has not been bound to the stream, fail.
4110 	 * This is not a problem when sockfs is directly
4111 	 * above us, because it will ensure that the socket
4112 	 * is first bound before allowing data to be sent.
4113 	 */
4114 	if (icmp->icmp_state == TS_UNBND) {
4115 		error = EPROTO;
4116 		goto ud_error2;
4117 	}
4118 	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
4119 	addrlen = tudr->DEST_length;
4120 
4121 	switch (connp->conn_family) {
4122 	case AF_INET6:
4123 		sin6 = (sin6_t *)addr;
4124 		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
4125 		    (sin6->sin6_family != AF_INET6)) {
4126 			error = EADDRNOTAVAIL;
4127 			goto ud_error2;
4128 		}
4129 
4130 		/* No support for mapped addresses on raw sockets */
4131 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4132 			error = EADDRNOTAVAIL;
4133 			goto ud_error2;
4134 		}
4135 		srcid = sin6->__sin6_src_id;
4136 
4137 		/*
4138 		 * If the local address is a mapped address return
4139 		 * an error.
4140 		 * It would be possible to send an IPv6 packet but the
4141 		 * response would never make it back to the application
4142 		 * since it is bound to a mapped address.
4143 		 */
4144 		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
4145 			error = EADDRNOTAVAIL;
4146 			goto ud_error2;
4147 		}
4148 
		/* An unspecified destination is rewritten in place to loopback */
4149 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4150 			sin6->sin6_addr = ipv6_loopback;
4151 
		/* Options present: take the ancillary-data output path */
4152 		if (tudr->OPT_length != 0) {
4153 			/*
4154 			 * If we are connected then the destination needs to be
4155 			 * the same as the connected one.
4156 			 */
4157 			if (icmp->icmp_state == TS_DATA_XFER &&
4158 			    !conn_same_as_last_v6(connp, sin6)) {
4159 				error = EISCONN;
4160 				goto ud_error2;
4161 			}
4162 			error = icmp_output_ancillary(connp, NULL, sin6,
4163 			    data_mp, mp, NULL, cr, pid);
4164 		} else {
4165 			ip_xmit_attr_t *ixa;
4166 
4167 			/*
4168 			 * We have to allocate an ip_xmit_attr_t before we grab
4169 			 * conn_lock and we need to hold conn_lock once we've
4170 			 * checked conn_same_as_last_v6 to handle concurrent
4171 			 * send* calls on a socket.
4172 			 */
4173 			ixa = conn_get_ixa(connp, B_FALSE);
4174 			if (ixa == NULL) {
4175 				error = ENOMEM;
4176 				goto ud_error2;
4177 			}
4178 			mutex_enter(&connp->conn_lock);
4179 
4180 			if (conn_same_as_last_v6(connp, sin6) &&
4181 			    connp->conn_lastsrcid == srcid &&
4182 			    ipsec_outbound_policy_current(ixa)) {
4183 				/* icmp_output_lastdst drops conn_lock */
4184 				error = icmp_output_lastdst(connp, data_mp, cr,
4185 				    pid, ixa);
4186 			} else {
4187 				/* icmp_output_newdst drops conn_lock */
4188 				error = icmp_output_newdst(connp, data_mp, NULL,
4189 				    sin6, cr, pid, ixa);
4190 			}
4191 			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4192 		}
		/*
		 * On success only the TPI header block is left to free.  On
		 * failure we leave the switch to report the error; data_mp is
		 * not freed on that path (icmp_output_newdst disposes of it
		 * itself; presumably the other output routines do too).
		 */
4193 		if (error == 0) {
4194 			freeb(mp);
4195 			return;
4196 		}
4197 		break;
4198 
4199 	case AF_INET:
4200 		sin = (sin_t *)addr;
4201 		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
4202 		    (sin->sin_family != AF_INET)) {
4203 			error = EADDRNOTAVAIL;
4204 			goto ud_error2;
4205 		}
		/* INADDR_ANY as a destination is rewritten in place to loopback */
4206 		if (sin->sin_addr.s_addr == INADDR_ANY)
4207 			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
4208 
4209 		/* Protocol 255 contains full IP headers */
4210 		/* Read without holding lock */
4211 		if (icmp->icmp_hdrincl) {
			/* Need at least a simple IP header in the first mblk */
4212 			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
4213 				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
4214 					error = EINVAL;
4215 					goto ud_error2;
4216 				}
4217 			}
4218 			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
4219 			if (error == 0) {
4220 				freeb(mp);
4221 				return;
4222 			}
4223 			/* data_mp consumed above */
4224 			data_mp = NULL;
4225 			goto ud_error2;
4226 		}
4227 
		/* Options present: take the ancillary-data output path */
4228 		if (tudr->OPT_length != 0) {
4229 			/*
4230 			 * If we are connected then the destination needs to be
4231 			 * the same as the connected one.
4232 			 */
4233 			if (icmp->icmp_state == TS_DATA_XFER &&
4234 			    !conn_same_as_last_v4(connp, sin)) {
4235 				error = EISCONN;
4236 				goto ud_error2;
4237 			}
4238 			error = icmp_output_ancillary(connp, sin, NULL,
4239 			    data_mp, mp, NULL, cr, pid);
4240 		} else {
4241 			ip_xmit_attr_t *ixa;
4242 
4243 			/*
4244 			 * We have to allocate an ip_xmit_attr_t before we grab
4245 			 * conn_lock and we need to hold conn_lock once we've
4246 			 * checked conn_same_as_last_v4 to handle concurrent
4247 			 * send* calls on a socket.
4248 			 */
4249 			ixa = conn_get_ixa(connp, B_FALSE);
4250 			if (ixa == NULL) {
4251 				error = ENOMEM;
4252 				goto ud_error2;
4253 			}
4254 			mutex_enter(&connp->conn_lock);
4255 
4256 			if (conn_same_as_last_v4(connp, sin) &&
4257 			    ipsec_outbound_policy_current(ixa)) {
4258 				/* icmp_output_lastdst drops conn_lock */
4259 				error = icmp_output_lastdst(connp, data_mp, cr,
4260 				    pid, ixa);
4261 			} else {
4262 				/* icmp_output_newdst drops conn_lock */
4263 				error = icmp_output_newdst(connp, data_mp, sin,
4264 				    NULL, cr, pid, ixa);
4265 			}
4266 			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4267 		}
		/* Same success/failure handling as the AF_INET6 case */
4268 		if (error == 0) {
4269 			freeb(mp);
4270 			return;
4271 		}
4272 		break;
4273 	}
	/*
	 * Reached when one of the output routines above returned an error.
	 * NOTE(review): the switch has no default, so a conn_family other
	 * than AF_INET/AF_INET6 would also land here with error == 0 and
	 * data_mp still allocated -- presumably that cannot happen; confirm.
	 */
4274 	ASSERT(mp != NULL);
4275 	/* mp is freed by the following routine */
4276 	icmp_ud_err(q, mp, (t_scalar_t)error);
4277 	return;
4278 
	/*
	 * Failures detected in this routine (or by icmp_output_hdrincl, which
	 * leaves data_mp NULL): count the error, free any payload we still
	 * own, and report the error.
	 */
4279 ud_error2:
4280 	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4281 	freemsg(data_mp);
4282 	ASSERT(mp != NULL);
4283 	/* mp is freed by the following routine */
4284 	icmp_ud_err(q, mp, (t_scalar_t)error);
4285 }
4286 
4287 /*
4288  * Handle the case of the IP address or flow label being different
4289  * for both IPv4 and IPv6.
4290  *
4291  * NOTE: The caller must hold conn_lock and we drop it here.
 *
 * Arguments:
 *	connp	- connection being sent on; conn_lock is held on entry.
 *	data_mp	- payload; freed at ud_error, otherwise passed on to the
 *		  header-prepend, policy and IP output routines.
 *	sin	- IPv4 destination, or NULL.
 *	sin6	- IPv6 destination; dereferenced whenever sin is NULL.
 *	cr, pid	- sender's credentials and pid, stored in the ixa.
 *	ixa	- transmit attributes (asserted non-NULL).  Both exits
 *		  ("done" and "ud_error") call ixa_refrele() on it.
 *
 * Returns 0 or an errno value.  conn_lock is not held on return.
4292  */
4293 static int
4294 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
4295     cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
4296 {
4297 	icmp_t		*icmp = connp->conn_icmp;
4298 	icmp_stack_t	*is = icmp->icmp_is;
4299 	int		error;
4300 	ip_xmit_attr_t	*oldixa;
4301 	boolean_t	do_ipsec;
4302 	uint_t		srcid;
4303 	uint32_t	flowinfo;
4304 	in6_addr_t	v6src;
4305 	in6_addr_t	v6dst;
4306 	in6_addr_t	v6nexthop;
4307 	in_port_t	dstport;
4308 
4309 	ASSERT(MUTEX_HELD(&connp->conn_lock));
4310 	ASSERT(ixa != NULL);
4311 
4312 	/*
4313 	 * We hold conn_lock across all the use and modifications of
4314 	 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
4315 	 * stay consistent.
4316 	 */
4317 
4318 	ASSERT(cr != NULL);
4319 	ixa->ixa_cred = cr;
4320 	ixa->ixa_cpid = pid;
4321 	if (is_system_labeled()) {
4322 		/* We need to restart with a label based on the cred */
4323 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
4324 	}
4325 	/*
4326 	 * If we are connected then the destination needs to be the
4327 	 * same as the connected one, which is not the case here since we
4328 	 * checked for that above.
4329 	 */
4330 	if (icmp->icmp_state == TS_DATA_XFER) {
4331 		mutex_exit(&connp->conn_lock);
4332 		error = EISCONN;
4333 		goto ud_error;
4334 	}
4335 
4336 	/* In case previous destination was multicast or multirt */
4337 	ip_attr_newdst(ixa);
4338 
4339 	/*
4340 	 * If laddr is unspecified then we look at sin6_src_id.
4341 	 * We will give precedence to a source address set with IPV6_PKTINFO
4342 	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
4343 	 * want ip_attr_connect to select a source (since it can fail) when
4344 	 * IPV6_PKTINFO is specified.
4345 	 * If this doesn't result in a source address then we get a source
4346 	 * from ip_attr_connect() below.
4347 	 */
4348 	v6src = connp->conn_saddr_v6;
4349 	if (sin != NULL) {
		/* IPv4 destination: carried internally as a v4-mapped address */
4350 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
4351 		dstport = sin->sin_port;
4352 		flowinfo = 0;
4353 		srcid = 0;
4354 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		/*
		 * NOTE(review): srcid was set to 0 just above, so this
		 * condition can never be true in the IPv4 branch.
		 */
4355 		if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
4356 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4357 			    connp->conn_netstack);
4358 		}
4359 		ixa->ixa_flags |= IXAF_IS_IPV4;
4360 	} else {
4361 		v6dst = sin6->sin6_addr;
4362 		dstport = sin6->sin6_port;
4363 		flowinfo = sin6->sin6_flowinfo;
4364 		srcid = sin6->__sin6_src_id;
		/* A scope id is only honoured for link-scope destinations */
4365 		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
4366 			ixa->ixa_scopeid = sin6->sin6_scope_id;
4367 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
4368 		} else {
4369 			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4370 		}
4371 		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4372 			ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4373 			    connp->conn_netstack);
4374 		}
4375 		if (IN6_IS_ADDR_V4MAPPED(&v6dst))
4376 			ixa->ixa_flags |= IXAF_IS_IPV4;
4377 		else
4378 			ixa->ixa_flags &= ~IXAF_IS_IPV4;
4379 	}
4380 	/* Handle IPV6_PKTINFO setting source address. */
4381 	if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
4382 	    (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) {
4383 		ip_pkt_t *ipp = &connp->conn_xmit_ipp;
4384 
		/* Only use ipp_addr if its family matches the destination's */
4385 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
4386 			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4387 				v6src = ipp->ipp_addr;
4388 		} else {
4389 			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4390 				v6src = ipp->ipp_addr;
4391 		}
4392 	}
4393 
4394 	/* Defer IPsec if it might need to look at ICMP type/code */
4395 	switch (ixa->ixa_protocol) {
4396 	case IPPROTO_ICMP:
4397 	case IPPROTO_ICMPV6:
4398 		do_ipsec = B_FALSE;
4399 		break;
4400 	default:
4401 		do_ipsec = B_TRUE;
4402 	}
4403 
4404 	ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
	/* conn_lock is dropped across ip_attr_connect() and re-taken below */
4405 	mutex_exit(&connp->conn_lock);
4406 
4407 	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
4408 	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
4409 	    (do_ipsec ? IPDF_IPSEC : 0));
4410 	switch (error) {
4411 	case 0:
4412 		break;
4413 	case EADDRNOTAVAIL:
4414 		/*
4415 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
4416 		 * Don't have the application see that errno
4417 		 */
4418 		error = ENETUNREACH;
4419 		goto failed;
4420 	case ENETDOWN:
4421 		/*
4422 		 * Have !ipif_addr_ready address; drop packet silently
4423 		 * until we can get applications to not send until we
4424 		 * are ready.
		 *
		 * The packet is still freed and counted in rawipOutErrors
		 * at ud_error; only the caller sees success.
4425 		 */
4426 		error = 0;
4427 		goto failed;
4428 	case EHOSTUNREACH:
4429 	case ENETUNREACH:
4430 		if (ixa->ixa_ire != NULL) {
4431 			/*
4432 			 * Let conn_ip_output/ire_send_noroute return
4433 			 * the error and send any local ICMP error.
4434 			 */
4435 			error = 0;
4436 			break;
4437 		}
4438 		/* FALLTHRU */
4439 	default:
4440 	failed:
4441 		goto ud_error;
4442 	}
4443 
4444 	mutex_enter(&connp->conn_lock);
4445 	/*
4446 	 * While we dropped the lock some other thread might have connected
4447 	 * this socket. If so we bail out with EISCONN to ensure that the
4448 	 * connecting thread is the one that updates conn_ixa, conn_ht_*
4449 	 * and conn_*last*.
4450 	 */
4451 	if (icmp->icmp_state == TS_DATA_XFER) {
4452 		mutex_exit(&connp->conn_lock);
4453 		error = EISCONN;
4454 		goto ud_error;
4455 	}
4456 
4457 	/*
4458 	 * We need to rebuild the headers if
4459 	 *  - we are labeling packets (could be different for different
4460 	 *    destinations)
4461 	 *  - we have a source route (or routing header) since we need to
4462 	 *    massage that to get the pseudo-header checksum
4463 	 *  - a socket option with COA_HEADER_CHANGED has been set which
4464 	 *    set conn_v6lastdst to zero.
4465 	 *
4466 	 * Otherwise the prepend function will just update the src, dst,
4467 	 * and flow label.
4468 	 */
4469 	if (is_system_labeled()) {
4470 		/* TX MLP requires SCM_UCRED and don't have that here */
4471 		if (connp->conn_mlp_type != mlptSingle) {
4472 			mutex_exit(&connp->conn_lock);
4473 			error = ECONNREFUSED;
4474 			goto ud_error;
4475 		}
4476 		/*
4477 		 * Check whether Trusted Solaris policy allows communication
4478 		 * with this host, and pretend that the destination is
4479 		 * unreachable if not.
4480 		 * Compute any needed label and place it in ipp_label_v4/v6.
4481 		 *
4482 		 * Later conn_build_hdr_template/conn_prepend_hdr takes
4483 		 * ipp_label_v4/v6 to form the packet.
4484 		 *
4485 		 * Tsol note: Since we hold conn_lock we know no other
4486 		 * thread manipulates conn_xmit_ipp.
4487 		 */
4488 		error = conn_update_label(connp, ixa, &v6dst,
4489 		    &connp->conn_xmit_ipp);
4490 		if (error != 0) {
4491 			mutex_exit(&connp->conn_lock);
4492 			goto ud_error;
4493 		}
4494 		/* Rebuild the header template */
4495 		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4496 		    flowinfo);
4497 		if (error != 0) {
4498 			mutex_exit(&connp->conn_lock);
4499 			goto ud_error;
4500 		}
4501 	} else if (connp->conn_xmit_ipp.ipp_fields &
4502 	    (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
4503 	    IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
4504 		/* Rebuild the header template */
4505 		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4506 		    flowinfo);
4507 		if (error != 0) {
4508 			mutex_exit(&connp->conn_lock);
4509 			goto ud_error;
4510 		}
4511 	} else {
4512 		/* Simply update the destination address if no source route */
4513 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
4514 			ipha_t	*ipha = (ipha_t *)connp->conn_ht_iphc;
4515 
4516 			IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
			/* Keep the template's DF bit in step with the ixa */
4517 			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
4518 				ipha->ipha_fragment_offset_and_flags |=
4519 				    IPH_DF_HTONS;
4520 			} else {
4521 				ipha->ipha_fragment_offset_and_flags &=
4522 				    ~IPH_DF_HTONS;
4523 			}
4524 		} else {
4525 			ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
4526 			ip6h->ip6_dst = v6dst;
4527 		}
4528 	}
4529 
4530 	/*
4531 	 * Remember the dst etc which corresponds to the built header
4532 	 * template and conn_ixa.
4533 	 */
4534 	oldixa = conn_replace_ixa(connp, ixa);
4535 	connp->conn_v6lastdst = v6dst;
4536 	connp->conn_lastflowinfo = flowinfo;
4537 	connp->conn_lastscopeid = ixa->ixa_scopeid;
4538 	connp->conn_lastsrcid = srcid;
4539 	/* Also remember a source to use together with lastdst */
4540 	connp->conn_v6lastsrc = v6src;
4541 
4542 	data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
4543 	    flowinfo, &error);
4544 
4545 	/* Done with conn_t */
4546 	mutex_exit(&connp->conn_lock);
	/* Drop the ixa that conn_replace_ixa() handed back */
4547 	ixa_refrele(oldixa);
4548 
4549 	if (data_mp == NULL) {
4550 		ASSERT(error != 0);
4551 		goto ud_error;
4552 	}
4553 
4554 	if (!do_ipsec) {
4555 		/* Policy might differ for different ICMP type/code */
4556 		data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
4557 		if (data_mp == NULL) {
			/*
			 * Count the error here and leave through "done":
			 * there is no message left for ud_error to free.
			 */
4558 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4559 			error = EHOSTUNREACH;	/* IPsec policy failure */
4560 			goto done;
4561 		}
4562 	}
4563 
4564 	/* We're done.  Pass the packet to ip. */
4565 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4566 
4567 	error = conn_ip_output(data_mp, ixa);
4568 	/* No rawipOutErrors if an error since IP increases its error counter */
4569 	switch (error) {
4570 	case 0:
4571 		break;
4572 	case EWOULDBLOCK:
		/* Flow-controlled: not reported to the caller as an error */
4573 		(void) ixa_check_drain_insert(connp, ixa);
4574 		error = 0;
4575 		break;
4576 	case EADDRNOTAVAIL:
4577 		/*
4578 		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
4579 		 * Don't have the application see that errno
4580 		 */
4581 		error = ENETUNREACH;
4582 		/* FALLTHRU */
4583 	default:
4584 		mutex_enter(&connp->conn_lock);
4585 		/*
4586 		 * Clear the source and v6lastdst so we call ip_attr_connect
4587 		 * for the next packet and try to pick a better source.
4588 		 */
4589 		if (connp->conn_mcbc_bind)
4590 			connp->conn_saddr_v6 = ipv6_all_zeros;
4591 		else
4592 			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4593 		connp->conn_v6lastdst = ipv6_all_zeros;
4594 		mutex_exit(&connp->conn_lock);
4595 		break;
4596 	}
	/* Exit for paths where data_mp is no longer ours to free */
4597 done:
4598 	ixa_refrele(ixa);
4599 	return (error);
4600 
	/* Error exit: release the ixa, count the error, free the payload */
4601 ud_error:
4602 	if (ixa != NULL)
4603 		ixa_refrele(ixa);
4604 
4605 	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4606 	freemsg(data_mp);
4607 	return (error);
4608 }
4609 
4610 /* ARGSUSED */
4611 static void
4612 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4613 {
4614 #ifdef DEBUG
4615 	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4616 #endif
4617 	freemsg(mp);
4618 }
4619 
/*
 * Handle every write-side message that icmp_wput() does not process itself.
 *
 *  - M_PROTO/M_PCPROTO: a block too short to hold a primitive type is
 *    dropped; otherwise the TPI primitive is dispatched to its handler.
 *    Primitives not listed in the switch fall through to ip_wput_nondata().
 *  - M_FLUSH: the write queue is flushed when FLUSHW is set, then the
 *    message is passed to ip_wput_nondata().
 *  - M_IOCTL: TI_GETPEERNAME/TI_GETMYNAME start a copyin of the caller's
 *    strbuf (completed in icmp_wput_iocdata()); ND_GET/ND_SET are offered to
 *    nd_getset(); anything else goes to ip_wput_nondata().
 *  - M_IOCDATA: handed to icmp_wput_iocdata().
 *  - Any other type is passed to ip_wput_nondata() unchanged.
 */
4620 static void
4621 icmp_wput_other(queue_t *q, mblk_t *mp)
4622 {
4623 	uchar_t	*rptr = mp->b_rptr;
4624 	struct iocblk *iocp;
4625 	conn_t	*connp = Q_TO_CONN(q);
4626 	icmp_t	*icmp = connp->conn_icmp;
4627 	icmp_stack_t *is = icmp->icmp_is;
4628 	cred_t *cr;
4629 
4630 	switch (mp->b_datap->db_type) {
4631 	case M_PROTO:
4632 	case M_PCPROTO:
4633 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4634 			/*
4635 			 * If the message does not contain a PRIM_type,
4636 			 * throw it away.
4637 			 */
4638 			freemsg(mp);
4639 			return;
4640 		}
4641 		switch (((t_primp_t)rptr)->type) {
4642 		case T_ADDR_REQ:
4643 			icmp_addr_req(q, mp);
4644 			return;
4645 		case O_T_BIND_REQ:
4646 		case T_BIND_REQ:
4647 			icmp_tpi_bind(q, mp);
4648 			return;
4649 		case T_CONN_REQ:
4650 			icmp_tpi_connect(q, mp);
4651 			return;
4652 		case T_CAPABILITY_REQ:
4653 			icmp_capability_req(q, mp);
4654 			return;
4655 		case T_INFO_REQ:
4656 			icmp_info_req(q, mp);
4657 			return;
4658 		case T_UNITDATA_REQ:
4659 			/*
4660 			 * If a T_UNITDATA_REQ gets here, the address must
4661 			 * be bad.  Valid T_UNITDATA_REQs are handled
4662 			 * in icmp_wput.
4663 			 */
4664 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4665 			return;
4666 		case T_UNBIND_REQ:
4667 			icmp_tpi_unbind(q, mp);
4668 			return;
4669 		case T_SVR4_OPTMGMT_REQ:
4670 			/*
4671 			 * All Solaris components should pass a db_credp
4672 			 * for this TPI message, hence we ASSERT.
4673 			 * But in case there is some other M_PROTO that looks
4674 			 * like a TPI message sent by some other kernel
4675 			 * component, we check and return an error.
4676 			 */
4677 			cr = msg_getcred(mp, NULL);
4678 			ASSERT(cr != NULL);
4679 			if (cr == NULL) {
4680 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4681 				return;
4682 			}
4683 
			/*
			 * Offer the request to the SNMP handler first; only
			 * if it declines is it treated as an option request.
			 */
4684 			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4685 			    cr)) {
4686 				svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4687 			}
4688 			return;
4689 
4690 		case T_OPTMGMT_REQ:
4691 			/*
4692 			 * All Solaris components should pass a db_credp
4693 			 * for this TPI message, hence we ASSERT.
4694 			 * But in case there is some other M_PROTO that looks
4695 			 * like a TPI message sent by some other kernel
4696 			 * component, we check and return an error.
4697 			 */
4698 			cr = msg_getcred(mp, NULL);
4699 			ASSERT(cr != NULL);
4700 			if (cr == NULL) {
4701 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
4702 				return;
4703 			}
4704 			tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4705 			return;
4706 
4707 		case T_DISCON_REQ:
4708 			icmp_tpi_disconnect(q, mp);
4709 			return;
4710 
4711 		/* The following TPI messages are not supported by icmp. */
4712 		case O_T_CONN_RES:
4713 		case T_CONN_RES:
4714 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4715 			return;
4716 
4717 		/* The following 3 TPI requests are illegal for icmp. */
4718 		case T_DATA_REQ:
4719 		case T_EXDATA_REQ:
4720 		case T_ORDREL_REQ:
4721 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4722 			return;
4723 		default:
			/* Unlisted primitive: passed to ip_wput_nondata() below */
4724 			break;
4725 		}
4726 		break;
4727 	case M_FLUSH:
4728 		if (*rptr & FLUSHW)
4729 			flushq(q, FLUSHDATA);
4730 		break;
4731 	case M_IOCTL:
4732 		iocp = (struct iocblk *)mp->b_rptr;
4733 		switch (iocp->ioc_cmd) {
4734 		case TI_GETPEERNAME:
4735 			if (icmp->icmp_state != TS_DATA_XFER) {
4736 				/*
4737 				 * If a default destination address has not
4738 				 * been associated with the stream, then we
4739 				 * don't know the peer's name.
4740 				 */
4741 				iocp->ioc_error = ENOTCONN;
4742 				iocp->ioc_count = 0;
4743 				mp->b_datap->db_type = M_IOCACK;
4744 				qreply(q, mp);
4745 				return;
4746 			}
4747 			/* FALLTHRU */
4748 		case TI_GETMYNAME:
4749 			/*
4750 			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
4751 			 * need to copyin the user's strbuf structure.
4752 			 * Processing will continue in the M_IOCDATA case
4753 			 * below.
4754 			 */
4755 			mi_copyin(q, mp, NULL,
4756 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4757 			return;
4758 		case ND_SET:
4759 			/* nd_getset performs the necessary checking */
4760 		case ND_GET:
			/*
			 * Reply here when nd_getset() returns non-zero;
			 * otherwise pass the ioctl to ip_wput_nondata() below.
			 */
4761 			if (nd_getset(q, is->is_nd, mp)) {
4762 				qreply(q, mp);
4763 				return;
4764 			}
4765 			break;
4766 		default:
4767 			break;
4768 		}
4769 		break;
4770 	case M_IOCDATA:
4771 		icmp_wput_iocdata(q, mp);
4772 		return;
4773 	default:
4774 		/* Unrecognized messages are passed through without change. */
4775 		break;
4776 	}
4777 	ip_wput_nondata(q, mp);
4778 }
4779 
4780 /*
4781  * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4782  * messages.
 *
 * Only TI_GETMYNAME and TI_GETPEERNAME are processed here; any other
 * command is passed to ip_wput_nondata().  The ioctl proceeds in steps
 * selected by mi_copy_state():
 *   copyin 1  - the caller's strbuf has arrived: validate it, allocate the
 *		 address buffer, fill it in and start copying it out.
 *   copyout 1 - the address has been copied out: copy out the strbuf.
 *   copyout 2 - both copied out: complete the ioctl successfully.
 * Any other state completes the ioctl with EPROTO.
4783  */
4784 static void
4785 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4786 {
4787 	mblk_t		*mp1;
4788 	STRUCT_HANDLE(strbuf, sb);
4789 	uint_t		addrlen;
4790 	conn_t		*connp = Q_TO_CONN(q);
4791 	icmp_t		*icmp = connp->conn_icmp;
4792 
4793 	/* Make sure it is one of ours. */
4794 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4795 	case TI_GETMYNAME:
4796 	case TI_GETPEERNAME:
4797 		break;
4798 	default:
4799 		ip_wput_nondata(q, mp);
4800 		return;
4801 	}
4802 
4803 	switch (mi_copy_state(q, mp, &mp1)) {
4804 	case -1:
		/* Presumably mi_copy_state() has already disposed of mp */
4805 		return;
4806 	case MI_COPY_CASE(MI_COPY_IN, 1):
4807 		break;
4808 	case MI_COPY_CASE(MI_COPY_OUT, 1):
4809 		/*
4810 		 * The address has been copied out, so now
4811 		 * copyout the strbuf.
4812 		 */
4813 		mi_copyout(q, mp);
4814 		return;
4815 	case MI_COPY_CASE(MI_COPY_OUT, 2):
4816 		/*
4817 		 * The address and strbuf have been copied out.
4818 		 * We're done, so just acknowledge the original
4819 		 * M_IOCTL.
4820 		 */
4821 		mi_copy_done(q, mp, 0);
4822 		return;
4823 	default:
4824 		/*
4825 		 * Something strange has happened, so acknowledge
4826 		 * the original M_IOCTL with an EPROTO error.
4827 		 */
4828 		mi_copy_done(q, mp, EPROTO);
4829 		return;
4830 	}
4831 
4832 	/*
4833 	 * Now we have the strbuf structure for TI_GETMYNAME
4834 	 * and TI_GETPEERNAME.  Next we copyout the requested
4835 	 * address and then we'll copyout the strbuf.
4836 	 */
4837 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
4838 	    (void *)mp1->b_rptr);
4839 
	/* The address size depends only on the socket's address family */
4840 	if (connp->conn_family == AF_INET)
4841 		addrlen = sizeof (sin_t);
4842 	else
4843 		addrlen = sizeof (sin6_t);
4844 
	/* The caller's buffer must be able to hold the whole address */
4845 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
4846 		mi_copy_done(q, mp, EINVAL);
4847 		return;
4848 	}
4849 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4850 	case TI_GETMYNAME:
4851 		break;
4852 	case TI_GETPEERNAME:
		/* A peer name exists only while the endpoint is connected */
4853 		if (icmp->icmp_state != TS_DATA_XFER) {
4854 			mi_copy_done(q, mp, ENOTCONN);
4855 			return;
4856 		}
4857 		break;
4858 	default:
		/* Defensive: other commands were filtered out at the top */
4859 		mi_copy_done(q, mp, EPROTO);
4860 		return;
4861 	}
4862 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
	/* Presumably mi_copyout_alloc() has already failed the ioctl */
4863 	if (!mp1)
4864 		return;
4865 
	/* Record the address length in the strbuf that is copied out later */
4866 	STRUCT_FSET(sb, len, addrlen);
4867 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4868 	case TI_GETMYNAME:
4869 		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
4870 		    &addrlen);
4871 		break;
4872 	case TI_GETPEERNAME:
4873 		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
4874 		    &addrlen);
4875 		break;
4876 	}
4877 	mp1->b_wptr += addrlen;
4878 	/* Copy out the address */
4879 	mi_copyout(q, mp);
4880 }
4881 
4882 void
4883 icmp_ddi_g_init(void)
4884 {
4885 	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
4886 	    icmp_opt_obj.odb_opt_arr_cnt);
4887 
4888 	/*
4889 	 * We want to be informed each time a stack is created or
4890 	 * destroyed in the kernel, so we can maintain the
4891 	 * set of icmp_stack_t's.
4892 	 */
4893 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
4894 }
4895 
4896 void
4897 icmp_ddi_g_destroy(void)
4898 {
4899 	netstack_unregister(NS_ICMP);
4900 }
4901 
4902 #define	INET_NAME	"ip"
4903 
4904 /*
4905  * Initialize the ICMP stack instance.
4906  */
4907 static void *
4908 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
4909 {
4910 	icmp_stack_t	*is;
4911 	icmpparam_t	*pa;
4912 	int		error = 0;
4913 	major_t		major;
4914 
4915 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
4916 	is->is_netstack = ns;
4917 
4918 	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
4919 	is->is_param_arr = pa;
4920 	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
4921 
4922 	(void) icmp_param_register(&is->is_nd,
4923 	    is->is_param_arr, A_CNT(icmp_param_arr));
4924 	is->is_ksp = rawip_kstat_init(stackid);
4925 
4926 	major = mod_name_to_major(INET_NAME);
4927 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
4928 	ASSERT(error == 0);
4929 	return (is);
4930 }
4931 
4932 /*
4933  * Free the ICMP stack instance.
4934  */
4935 static void
4936 rawip_stack_fini(netstackid_t stackid, void *arg)
4937 {
4938 	icmp_stack_t *is = (icmp_stack_t *)arg;
4939 
4940 	nd_free(&is->is_nd);
4941 	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
4942 	is->is_param_arr = NULL;
4943 
4944 	rawip_kstat_fini(stackid, is->is_ksp);
4945 	is->is_ksp = NULL;
4946 	ldi_ident_release(is->is_ldi_ident);
4947 	kmem_free(is, sizeof (*is));
4948 }
4949 
4950 static void *
4951 rawip_kstat_init(netstackid_t stackid) {
4952 	kstat_t	*ksp;
4953 
4954 	rawip_named_kstat_t template = {
4955 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
4956 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
4957 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
4958 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
4959 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
4960 	};
4961 
4962 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
4963 					KSTAT_TYPE_NAMED,
4964 					NUM_OF_FIELDS(rawip_named_kstat_t),
4965 					0, stackid);
4966 	if (ksp == NULL || ksp->ks_data == NULL)
4967 		return (NULL);
4968 
4969 	bcopy(&template, ksp->ks_data, sizeof (template));
4970 	ksp->ks_update = rawip_kstat_update;
4971 	ksp->ks_private = (void *)(uintptr_t)stackid;
4972 
4973 	kstat_install(ksp);
4974 	return (ksp);
4975 }
4976 
4977 static void
4978 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
4979 {
4980 	if (ksp != NULL) {
4981 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
4982 		kstat_delete_netstack(ksp, stackid);
4983 	}
4984 }
4985 
4986 static int
4987 rawip_kstat_update(kstat_t *ksp, int rw)
4988 {
4989 	rawip_named_kstat_t *rawipkp;
4990 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
4991 	netstack_t	*ns;
4992 	icmp_stack_t	*is;
4993 
4994 	if ((ksp == NULL) || (ksp->ks_data == NULL))
4995 		return (EIO);
4996 
4997 	if (rw == KSTAT_WRITE)
4998 		return (EACCES);
4999 
5000 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5001 
5002 	ns = netstack_find_by_stackid(stackid);
5003 	if (ns == NULL)
5004 		return (-1);
5005 	is = ns->netstack_icmp;
5006 	if (is == NULL) {
5007 		netstack_rele(ns);
5008 		return (-1);
5009 	}
5010 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5011 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5012 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5013 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5014 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5015 	netstack_rele(ns);
5016 	return (0);
5017 }
5018 
5019 /* ARGSUSED */
5020 int
5021 rawip_accept(sock_lower_handle_t lproto_handle,
5022     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5023     cred_t *cr)
5024 {
5025 	return (EOPNOTSUPP);
5026 }
5027 
5028 /* ARGSUSED */
5029 int
5030 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5031     socklen_t len, cred_t *cr)
5032 {
5033 	conn_t  *connp = (conn_t *)proto_handle;
5034 	int	error;
5035 
5036 	/* All Solaris components should pass a cred for this operation. */
5037 	ASSERT(cr != NULL);
5038 
5039 	/* Binding to a NULL address really means unbind */
5040 	if (sa == NULL)
5041 		error = rawip_do_unbind(connp);
5042 	else
5043 		error = rawip_do_bind(connp, sa, len);
5044 
5045 	if (error < 0) {
5046 		if (error == -TOUTSTATE)
5047 			error = EINVAL;
5048 		else
5049 			error = proto_tlitosyserr(-error);
5050 	}
5051 	return (error);
5052 }
5053 
5054 static int
5055 rawip_implicit_bind(conn_t *connp)
5056 {
5057 	sin6_t sin6addr;
5058 	sin_t *sin;
5059 	sin6_t *sin6;
5060 	socklen_t len;
5061 	int error;
5062 
5063 	if (connp->conn_family == AF_INET) {
5064 		len = sizeof (struct sockaddr_in);
5065 		sin = (sin_t *)&sin6addr;
5066 		*sin = sin_null;
5067 		sin->sin_family = AF_INET;
5068 		sin->sin_addr.s_addr = INADDR_ANY;
5069 	} else {
5070 		ASSERT(connp->conn_family == AF_INET6);
5071 		len = sizeof (sin6_t);
5072 		sin6 = (sin6_t *)&sin6addr;
5073 		*sin6 = sin6_null;
5074 		sin6->sin6_family = AF_INET6;
5075 		V6_SET_ZERO(sin6->sin6_addr);
5076 	}
5077 
5078 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
5079 
5080 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
5081 }
5082 
5083 static int
5084 rawip_unbind(conn_t *connp)
5085 {
5086 	int error;
5087 
5088 	error = rawip_do_unbind(connp);
5089 	if (error < 0) {
5090 		error = proto_tlitosyserr(-error);
5091 	}
5092 	return (error);
5093 }
5094 
5095 /* ARGSUSED */
5096 int
5097 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5098 {
5099 	return (EOPNOTSUPP);
5100 }
5101 
5102 int
5103 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5104     socklen_t len, sock_connid_t *id, cred_t *cr)
5105 {
5106 	conn_t	*connp = (conn_t *)proto_handle;
5107 	icmp_t *icmp = connp->conn_icmp;
5108 	int	error;
5109 	boolean_t did_bind = B_FALSE;
5110 	pid_t	pid = curproc->p_pid;
5111 
5112 	/* All Solaris components should pass a cred for this operation. */
5113 	ASSERT(cr != NULL);
5114 
5115 	if (sa == NULL) {
5116 		/*
5117 		 * Disconnect
5118 		 * Make sure we are connected
5119 		 */
5120 		if (icmp->icmp_state != TS_DATA_XFER)
5121 			return (EINVAL);
5122 
5123 		error = icmp_disconnect(connp);
5124 		return (error);
5125 	}
5126 
5127 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
5128 	if (error != 0)
5129 		return (error);
5130 
5131 	/* do an implicit bind if necessary */
5132 	if (icmp->icmp_state == TS_UNBND) {
5133 		error = rawip_implicit_bind(connp);
5134 		/*
5135 		 * We could be racing with an actual bind, in which case
5136 		 * we would see EPROTO. We cross our fingers and try
5137 		 * to connect.
5138 		 */
5139 		if (!(error == 0 || error == EPROTO))
5140 			return (error);
5141 		did_bind = B_TRUE;
5142 	}
5143 
5144 	/*
5145 	 * set SO_DGRAM_ERRIND
5146 	 */
5147 	connp->conn_dgram_errind = B_TRUE;
5148 
5149 	error = rawip_do_connect(connp, sa, len, cr, pid);
5150 	if (error != 0 && did_bind) {
5151 		int unbind_err;
5152 
5153 		unbind_err = rawip_unbind(connp);
5154 		ASSERT(unbind_err == 0);
5155 	}
5156 
5157 	if (error == 0) {
5158 		*id = 0;
5159 		(*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5160 		    0, NULL, -1);
5161 	} else if (error < 0) {
5162 		error = proto_tlitosyserr(-error);
5163 	}
5164 	return (error);
5165 }
5166 
5167 /* ARGSUSED2 */
5168 int
5169 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
5170     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
5171 {
5172 	conn_t  *connp = (conn_t *)proto_handle;
5173 	icmp_t	*icmp;
5174 	struct T_capability_ack tca;
5175 	struct sockaddr_in6 laddr, faddr;
5176 	socklen_t laddrlen, faddrlen;
5177 	short opts;
5178 	struct stroptions *stropt;
5179 	mblk_t *stropt_mp;
5180 	int error;
5181 
5182 	icmp = connp->conn_icmp;
5183 
5184 	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
5185 
5186 	/*
5187 	 * setup the fallback stream that was allocated
5188 	 */
5189 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
5190 	connp->conn_minor_arena = WR(q)->q_ptr;
5191 
5192 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
5193 
5194 	WR(q)->q_qinfo = &icmpwinit;
5195 
5196 	connp->conn_rq = RD(q);
5197 	connp->conn_wq = WR(q);
5198 
5199 	/* Notify stream head about options before sending up data */
5200 	stropt_mp->b_datap->db_type = M_SETOPTS;
5201 	stropt_mp->b_wptr += sizeof (*stropt);
5202 	stropt = (struct stroptions *)stropt_mp->b_rptr;
5203 	stropt->so_flags = SO_WROFF | SO_HIWAT;
5204 	stropt->so_wroff = connp->conn_wroff;
5205 	stropt->so_hiwat = connp->conn_rcvbuf;
5206 	putnext(RD(q), stropt_mp);
5207 
5208 	/*
5209 	 * free helper stream
5210 	 */
5211 	ip_free_helper_stream(connp);
5212 
5213 	/*
5214 	 * Collect the information needed to sync with the sonode
5215 	 */
5216 	icmp_do_capability_ack(icmp, &tca, TC1_INFO);
5217 
5218 	laddrlen = faddrlen = sizeof (sin6_t);
5219 	(void) rawip_getsockname((sock_lower_handle_t)connp,
5220 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
5221 	error = rawip_getpeername((sock_lower_handle_t)connp,
5222 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
5223 	if (error != 0)
5224 		faddrlen = 0;
5225 	opts = 0;
5226 	if (connp->conn_dgram_errind)
5227 		opts |= SO_DGRAM_ERRIND;
5228 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
5229 		opts |= SO_DONTROUTE;
5230 
5231 	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
5232 	    (struct sockaddr *)&laddr, laddrlen,
5233 	    (struct sockaddr *)&faddr, faddrlen, opts);
5234 
5235 	/*
5236 	 * Attempts to send data up during fallback will result in it being
5237 	 * queued in icmp_t. Now we push up any queued packets.
5238 	 */
5239 	mutex_enter(&icmp->icmp_recv_lock);
5240 	while (icmp->icmp_fallback_queue_head != NULL) {
5241 		mblk_t	*mp;
5242 
5243 		mp = icmp->icmp_fallback_queue_head;
5244 		icmp->icmp_fallback_queue_head = mp->b_next;
5245 		mp->b_next = NULL;
5246 		mutex_exit(&icmp->icmp_recv_lock);
5247 		putnext(RD(q), mp);
5248 		mutex_enter(&icmp->icmp_recv_lock);
5249 	}
5250 	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
5251 
5252 	/*
5253 	 * No longer a streams less socket
5254 	 */
5255 	mutex_enter(&connp->conn_lock);
5256 	connp->conn_flags &= ~IPCL_NONSTR;
5257 	mutex_exit(&connp->conn_lock);
5258 
5259 	mutex_exit(&icmp->icmp_recv_lock);
5260 
5261 	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
5262 	    icmp->icmp_fallback_queue_tail == NULL);
5263 
5264 	ASSERT(connp->conn_ref >= 1);
5265 
5266 	return (0);
5267 }
5268 
5269 /* ARGSUSED2 */
5270 sock_lower_handle_t
5271 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5272     uint_t *smodep, int *errorp, int flags, cred_t *credp)
5273 {
5274 	conn_t *connp;
5275 
5276 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5277 		*errorp = EPROTONOSUPPORT;
5278 		return (NULL);
5279 	}
5280 
5281 	connp = rawip_do_open(family, credp, errorp, flags);
5282 	if (connp != NULL) {
5283 		connp->conn_flags |= IPCL_NONSTR;
5284 
5285 		mutex_enter(&connp->conn_lock);
5286 		connp->conn_state_flags &= ~CONN_INCIPIENT;
5287 		mutex_exit(&connp->conn_lock);
5288 		*sock_downcalls = &sock_rawip_downcalls;
5289 		*smodep = SM_ATOMIC;
5290 	} else {
5291 		ASSERT(*errorp != 0);
5292 	}
5293 
5294 	return ((sock_lower_handle_t)connp);
5295 }
5296 
5297 /* ARGSUSED3 */
5298 void
5299 rawip_activate(sock_lower_handle_t proto_handle,
5300     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5301     cred_t *cr)
5302 {
5303 	conn_t 			*connp = (conn_t *)proto_handle;
5304 	struct sock_proto_props sopp;
5305 
5306 	/* All Solaris components should pass a cred for this operation. */
5307 	ASSERT(cr != NULL);
5308 
5309 	connp->conn_upcalls = sock_upcalls;
5310 	connp->conn_upper_handle = sock_handle;
5311 
5312 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5313 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5314 	sopp.sopp_wroff = connp->conn_wroff;
5315 	sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5316 	sopp.sopp_rxlowat = connp->conn_rcvlowat;
5317 	sopp.sopp_maxblk = INFPSZ;
5318 	sopp.sopp_maxpsz = IP_MAXPACKET;
5319 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5320 	    icmp_mod_info.mi_minpsz;
5321 
5322 	(*connp->conn_upcalls->su_set_proto_props)
5323 	    (connp->conn_upper_handle, &sopp);
5324 
5325 	icmp_bind_proto(connp->conn_icmp);
5326 }
5327 
5328 /* ARGSUSED3 */
5329 int
5330 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5331     socklen_t *salenp, cred_t *cr)
5332 {
5333 	conn_t  *connp = (conn_t *)proto_handle;
5334 	icmp_t  *icmp = connp->conn_icmp;
5335 	int	error;
5336 
5337 	/* All Solaris components should pass a cred for this operation. */
5338 	ASSERT(cr != NULL);
5339 
5340 	mutex_enter(&connp->conn_lock);
5341 	if (icmp->icmp_state != TS_DATA_XFER)
5342 		error = ENOTCONN;
5343 	else
5344 		error = conn_getpeername(connp, sa, salenp);
5345 	mutex_exit(&connp->conn_lock);
5346 	return (error);
5347 }
5348 
5349 /* ARGSUSED3 */
5350 int
5351 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5352     socklen_t *salenp, cred_t *cr)
5353 {
5354 	conn_t  *connp = (conn_t *)proto_handle;
5355 	int	error;
5356 
5357 	/* All Solaris components should pass a cred for this operation. */
5358 	ASSERT(cr != NULL);
5359 
5360 	mutex_enter(&connp->conn_lock);
5361 	error = conn_getsockname(connp, sa, salenp);
5362 	mutex_exit(&connp->conn_lock);
5363 	return (error);
5364 }
5365 
5366 int
5367 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5368     const void *optvalp, socklen_t optlen, cred_t *cr)
5369 {
5370 	conn_t	*connp = (conn_t *)proto_handle;
5371 	int error;
5372 
5373 	/* All Solaris components should pass a cred for this operation. */
5374 	ASSERT(cr != NULL);
5375 
5376 	error = proto_opt_check(level, option_name, optlen, NULL,
5377 	    icmp_opt_obj.odb_opt_des_arr,
5378 	    icmp_opt_obj.odb_opt_arr_cnt,
5379 	    B_TRUE, B_FALSE, cr);
5380 
5381 	if (error != 0) {
5382 		/*
5383 		 * option not recognized
5384 		 */
5385 		if (error < 0) {
5386 			error = proto_tlitosyserr(-error);
5387 		}
5388 		return (error);
5389 	}
5390 
5391 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5392 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5393 	    (uchar_t *)optvalp, NULL, cr);
5394 
5395 	ASSERT(error >= 0);
5396 
5397 	return (error);
5398 }
5399 
5400 int
5401 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5402     void *optvalp, socklen_t *optlen, cred_t *cr)
5403 {
5404 	int		error;
5405 	conn_t		*connp = (conn_t *)proto_handle;
5406 	t_uscalar_t	max_optbuf_len;
5407 	void		*optvalp_buf;
5408 	int		len;
5409 
5410 	/* All Solaris components should pass a cred for this operation. */
5411 	ASSERT(cr != NULL);
5412 
5413 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5414 	    icmp_opt_obj.odb_opt_des_arr,
5415 	    icmp_opt_obj.odb_opt_arr_cnt,
5416 	    B_FALSE, B_TRUE, cr);
5417 
5418 	if (error != 0) {
5419 		if (error < 0) {
5420 			error = proto_tlitosyserr(-error);
5421 		}
5422 		return (error);
5423 	}
5424 
5425 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5426 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5427 	if (len == -1) {
5428 		kmem_free(optvalp_buf, max_optbuf_len);
5429 		return (EINVAL);
5430 	}
5431 
5432 	/*
5433 	 * update optlen and copy option value
5434 	 */
5435 	t_uscalar_t size = MIN(len, *optlen);
5436 
5437 	bcopy(optvalp_buf, optvalp, size);
5438 	bcopy(&size, optlen, sizeof (size));
5439 
5440 	kmem_free(optvalp_buf, max_optbuf_len);
5441 	return (0);
5442 }
5443 
5444 /* ARGSUSED1 */
5445 int
5446 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5447 {
5448 	conn_t	*connp = (conn_t *)proto_handle;
5449 
5450 	/* All Solaris components should pass a cred for this operation. */
5451 	ASSERT(cr != NULL);
5452 
5453 	(void) rawip_do_close(connp);
5454 	return (0);
5455 }
5456 
5457 /* ARGSUSED2 */
5458 int
5459 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5460 {
5461 	conn_t  *connp = (conn_t *)proto_handle;
5462 
5463 	/* All Solaris components should pass a cred for this operation. */
5464 	ASSERT(cr != NULL);
5465 
5466 	/* shut down the send side */
5467 	if (how != SHUT_RD)
5468 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5469 		    SOCK_OPCTL_SHUT_SEND, 0);
5470 	/* shut down the recv side */
5471 	if (how != SHUT_WR)
5472 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5473 		    SOCK_OPCTL_SHUT_RECV, 0);
5474 	return (0);
5475 }
5476 
5477 void
5478 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5479 {
5480 	conn_t  *connp = (conn_t *)proto_handle;
5481 	icmp_t	*icmp = connp->conn_icmp;
5482 
5483 	mutex_enter(&icmp->icmp_recv_lock);
5484 	connp->conn_flow_cntrld = B_FALSE;
5485 	mutex_exit(&icmp->icmp_recv_lock);
5486 }
5487 
5488 int
5489 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5490     int mode, int32_t *rvalp, cred_t *cr)
5491 {
5492 	conn_t  	*connp = (conn_t *)proto_handle;
5493 	int		error;
5494 
5495 	/* All Solaris components should pass a cred for this operation. */
5496 	ASSERT(cr != NULL);
5497 
5498 	/*
5499 	 * If we don't have a helper stream then create one.
5500 	 * ip_create_helper_stream takes care of locking the conn_t,
5501 	 * so this check for NULL is just a performance optimization.
5502 	 */
5503 	if (connp->conn_helper_info == NULL) {
5504 		icmp_stack_t *is = connp->conn_icmp->icmp_is;
5505 
5506 		ASSERT(is->is_ldi_ident != NULL);
5507 
5508 		/*
5509 		 * Create a helper stream for non-STREAMS socket.
5510 		 */
5511 		error = ip_create_helper_stream(connp, is->is_ldi_ident);
5512 		if (error != 0) {
5513 			ip0dbg(("rawip_ioctl: create of IP helper stream "
5514 			    "failed %d\n", error));
5515 			return (error);
5516 		}
5517 	}
5518 
5519 	switch (cmd) {
5520 	case ND_SET:
5521 	case ND_GET:
5522 	case _SIOCSOCKFALLBACK:
5523 	case TI_GETPEERNAME:
5524 	case TI_GETMYNAME:
5525 #ifdef DEBUG
5526 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5527 		    " socket", cmd);
5528 #endif
5529 		error = EINVAL;
5530 		break;
5531 	default:
5532 		/*
5533 		 * Pass on to IP using helper stream
5534 		 */
5535 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5536 		    cmd, arg, mode, cr, rvalp);
5537 		break;
5538 	}
5539 	return (error);
5540 }
5541 
5542 int
5543 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5544     cred_t *cr)
5545 {
5546 	sin6_t		*sin6;
5547 	sin_t		*sin = NULL;
5548 	uint_t		srcid;
5549 	conn_t		*connp = (conn_t *)proto_handle;
5550 	icmp_t		*icmp = connp->conn_icmp;
5551 	int		error = 0;
5552 	icmp_stack_t	*is = icmp->icmp_is;
5553 	pid_t		pid = curproc->p_pid;
5554 	ip_xmit_attr_t	*ixa;
5555 
5556 	ASSERT(DB_TYPE(mp) == M_DATA);
5557 
5558 	/* All Solaris components should pass a cred for this operation. */
5559 	ASSERT(cr != NULL);
5560 
5561 	/* do an implicit bind if necessary */
5562 	if (icmp->icmp_state == TS_UNBND) {
5563 		error = rawip_implicit_bind(connp);
5564 		/*
5565 		 * We could be racing with an actual bind, in which case
5566 		 * we would see EPROTO. We cross our fingers and try
5567 		 * to connect.
5568 		 */
5569 		if (!(error == 0 || error == EPROTO)) {
5570 			freemsg(mp);
5571 			return (error);
5572 		}
5573 	}
5574 
5575 	/* Protocol 255 contains full IP headers */
5576 	/* Read without holding lock */
5577 	if (icmp->icmp_hdrincl) {
5578 		ASSERT(connp->conn_ipversion == IPV4_VERSION);
5579 		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5580 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5581 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5582 				freemsg(mp);
5583 				return (EINVAL);
5584 			}
5585 		}
5586 		error = icmp_output_hdrincl(connp, mp, cr, pid);
5587 		if (is->is_sendto_ignerr)
5588 			return (0);
5589 		else
5590 			return (error);
5591 	}
5592 
5593 	/* Connected? */
5594 	if (msg->msg_name == NULL) {
5595 		if (icmp->icmp_state != TS_DATA_XFER) {
5596 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5597 			return (EDESTADDRREQ);
5598 		}
5599 		if (msg->msg_controllen != 0) {
5600 			error = icmp_output_ancillary(connp, NULL, NULL, mp,
5601 			    NULL, msg, cr, pid);
5602 		} else {
5603 			error = icmp_output_connected(connp, mp, cr, pid);
5604 		}
5605 		if (is->is_sendto_ignerr)
5606 			return (0);
5607 		else
5608 			return (error);
5609 	}
5610 	if (icmp->icmp_state == TS_DATA_XFER) {
5611 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5612 		return (EISCONN);
5613 	}
5614 	error = proto_verify_ip_addr(connp->conn_family,
5615 	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5616 	if (error != 0) {
5617 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5618 		return (error);
5619 	}
5620 	switch (connp->conn_family) {
5621 	case AF_INET6:
5622 		sin6 = (sin6_t *)msg->msg_name;
5623 
5624 		/* No support for mapped addresses on raw sockets */
5625 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5626 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5627 			return (EADDRNOTAVAIL);
5628 		}
5629 		srcid = sin6->__sin6_src_id;
5630 
5631 		/*
5632 		 * If the local address is a mapped address return
5633 		 * an error.
5634 		 * It would be possible to send an IPv6 packet but the
5635 		 * response would never make it back to the application
5636 		 * since it is bound to a mapped address.
5637 		 */
5638 		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5639 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5640 			return (EADDRNOTAVAIL);
5641 		}
5642 
5643 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5644 			sin6->sin6_addr = ipv6_loopback;
5645 
5646 		/*
5647 		 * We have to allocate an ip_xmit_attr_t before we grab
5648 		 * conn_lock and we need to hold conn_lock once we've check
5649 		 * conn_same_as_last_v6 to handle concurrent send* calls on a
5650 		 * socket.
5651 		 */
5652 		if (msg->msg_controllen == 0) {
5653 			ixa = conn_get_ixa(connp, B_FALSE);
5654 			if (ixa == NULL) {
5655 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5656 				return (ENOMEM);
5657 			}
5658 		} else {
5659 			ixa = NULL;
5660 		}
5661 		mutex_enter(&connp->conn_lock);
5662 		if (icmp->icmp_delayed_error != 0) {
5663 			sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5664 
5665 			error = icmp->icmp_delayed_error;
5666 			icmp->icmp_delayed_error = 0;
5667 
5668 			/* Compare IP address and family */
5669 
5670 			if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5671 			    &sin2->sin6_addr) &&
5672 			    sin6->sin6_family == sin2->sin6_family) {
5673 				mutex_exit(&connp->conn_lock);
5674 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5675 				if (ixa != NULL)
5676 					ixa_refrele(ixa);
5677 				return (error);
5678 			}
5679 		}
5680 		if (msg->msg_controllen != 0) {
5681 			mutex_exit(&connp->conn_lock);
5682 			ASSERT(ixa == NULL);
5683 			error = icmp_output_ancillary(connp, NULL, sin6, mp,
5684 			    NULL, msg, cr, pid);
5685 		} else if (conn_same_as_last_v6(connp, sin6) &&
5686 		    connp->conn_lastsrcid == srcid &&
5687 		    ipsec_outbound_policy_current(ixa)) {
5688 			/* icmp_output_lastdst drops conn_lock */
5689 			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5690 		} else {
5691 			/* icmp_output_newdst drops conn_lock */
5692 			error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5693 			    pid, ixa);
5694 		}
5695 		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5696 		if (is->is_sendto_ignerr)
5697 			return (0);
5698 		else
5699 			return (error);
5700 	case AF_INET:
5701 		sin = (sin_t *)msg->msg_name;
5702 
5703 		if (sin->sin_addr.s_addr == INADDR_ANY)
5704 			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5705 
5706 		/*
5707 		 * We have to allocate an ip_xmit_attr_t before we grab
5708 		 * conn_lock and we need to hold conn_lock once we've check
5709 		 * conn_same_as_last_v6 to handle concurrent send* on a socket.
5710 		 */
5711 		if (msg->msg_controllen == 0) {
5712 			ixa = conn_get_ixa(connp, B_FALSE);
5713 			if (ixa == NULL) {
5714 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5715 				return (ENOMEM);
5716 			}
5717 		} else {
5718 			ixa = NULL;
5719 		}
5720 		mutex_enter(&connp->conn_lock);
5721 		if (icmp->icmp_delayed_error != 0) {
5722 			sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5723 
5724 			error = icmp->icmp_delayed_error;
5725 			icmp->icmp_delayed_error = 0;
5726 
5727 			/* Compare IP address */
5728 
5729 			if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5730 				mutex_exit(&connp->conn_lock);
5731 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5732 				if (ixa != NULL)
5733 					ixa_refrele(ixa);
5734 				return (error);
5735 			}
5736 		}
5737 
5738 		if (msg->msg_controllen != 0) {
5739 			mutex_exit(&connp->conn_lock);
5740 			ASSERT(ixa == NULL);
5741 			error = icmp_output_ancillary(connp, sin, NULL, mp,
5742 			    NULL, msg, cr, pid);
5743 		} else if (conn_same_as_last_v4(connp, sin) &&
5744 		    ipsec_outbound_policy_current(ixa)) {
5745 			/* icmp_output_lastdst drops conn_lock */
5746 			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5747 		} else {
5748 			/* icmp_output_newdst drops conn_lock */
5749 			error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5750 			    pid, ixa);
5751 		}
5752 		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5753 		if (is->is_sendto_ignerr)
5754 			return (0);
5755 		else
5756 			return (error);
5757 	default:
5758 		return (EINVAL);
5759 	}
5760 }
5761 
5762 sock_downcalls_t sock_rawip_downcalls = {
5763 	rawip_activate,
5764 	rawip_accept,
5765 	rawip_bind,
5766 	rawip_listen,
5767 	rawip_connect,
5768 	rawip_getpeername,
5769 	rawip_getsockname,
5770 	rawip_getsockopt,
5771 	rawip_setsockopt,
5772 	rawip_send,
5773 	NULL,
5774 	NULL,
5775 	NULL,
5776 	rawip_shutdown,
5777 	rawip_clr_flowctrl,
5778 	rawip_ioctl,
5779 	rawip_close
5780 };
5781