1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/stropts.h> 30 #include <sys/strlog.h> 31 #include <sys/strsun.h> 32 #define _SUN_TPI_VERSION 2 33 #include <sys/tihdr.h> 34 #include <sys/timod.h> 35 #include <sys/ddi.h> 36 #include <sys/sunddi.h> 37 #include <sys/strsubr.h> 38 #include <sys/suntpi.h> 39 #include <sys/xti_inet.h> 40 #include <sys/cmn_err.h> 41 #include <sys/kmem.h> 42 #include <sys/cred_impl.h> 43 #include <sys/policy.h> 44 #include <sys/priv.h> 45 #include <sys/ucred.h> 46 #include <sys/zone.h> 47 48 #include <sys/sockio.h> 49 #include <sys/socket.h> 50 #include <sys/socketvar.h> 51 #include <sys/vtrace.h> 52 #include <sys/sdt.h> 53 #include <sys/debug.h> 54 #include <sys/isa_defs.h> 55 #include <sys/random.h> 56 #include <netinet/in.h> 57 #include <netinet/ip6.h> 58 #include <netinet/icmp6.h> 59 #include <netinet/udp.h> 60 61 #include <inet/common.h> 62 #include <inet/ip.h> 63 #include <inet/ip_impl.h> 64 #include <inet/ipsec_impl.h> 65 #include <inet/ip6.h> 66 #include <inet/ip_ire.h> 67 #include <inet/ip_if.h> 68 #include <inet/ip_multi.h> 69 #include <inet/ip_ndp.h> 70 #include <inet/proto_set.h> 71 #include <inet/mib2.h> 72 #include <inet/nd.h> 73 #include <inet/optcom.h> 74 #include <inet/snmpcom.h> 75 #include <inet/kstatcom.h> 76 #include <inet/ipclassifier.h> 77 78 #include <sys/tsol/label.h> 79 #include <sys/tsol/tnet.h> 80 81 #include <inet/rawip_impl.h> 82 83 #include <sys/disp.h> 84 85 /* 86 * Synchronization notes: 87 * 88 * RAWIP is MT and uses the usual kernel synchronization primitives. We use 89 * conn_lock to protect the icmp_t. 90 * 91 * Plumbing notes: 92 * ICMP is always a device driver. For compatibility with mibopen() code 93 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough 94 * dummy module. 95 */ 96 97 static void icmp_addr_req(queue_t *q, mblk_t *mp); 98 static void icmp_tpi_bind(queue_t *q, mblk_t *mp); 99 static void icmp_bind_proto(icmp_t *icmp); 100 static int icmp_build_hdr_template(conn_t *, const in6_addr_t *, 101 const in6_addr_t *, uint32_t); 102 static void icmp_capability_req(queue_t *q, mblk_t *mp); 103 static int icmp_close(queue_t *q, int flags); 104 static void icmp_close_free(conn_t *); 105 static void icmp_tpi_connect(queue_t *q, mblk_t *mp); 106 static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp); 107 static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, 108 int sys_error); 109 static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 110 t_scalar_t tlierr, int sys_error); 111 static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, 112 ip_recv_attr_t *); 113 static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, 114 ip_recv_attr_t *); 115 static void icmp_info_req(queue_t *q, mblk_t *mp); 116 static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); 117 static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags); 118 static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, 119 cred_t *credp); 120 static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, 121 cred_t *credp); 122 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); 123 int icmp_opt_set(conn_t *connp, uint_t optset_context, 124 int level, int name, uint_t inlen, 125 uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 126 void *thisdg_attrs, cred_t *cr); 127 int icmp_opt_get(conn_t *connp, int level, int name, 128 uchar_t *ptr); 129 static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, 130 sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa); 131 static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, 132 const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *); 133 static mblk_t *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *, 134 mblk_t *, const in6_addr_t *, uint32_t, int *); 135 static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 136 uchar_t *ptr, int len); 137 static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); 138 static void icmp_tpi_unbind(queue_t *q, mblk_t *mp); 139 static void icmp_wput(queue_t *q, mblk_t *mp); 140 static void icmp_wput_fallback(queue_t *q, mblk_t *mp); 141 static void icmp_wput_other(queue_t *q, mblk_t *mp); 142 static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); 143 static void icmp_wput_restricted(queue_t *q, mblk_t *mp); 144 static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t); 145 146 static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns); 147 static void rawip_stack_fini(netstackid_t stackid, void *arg); 148 149 static void *rawip_kstat_init(netstackid_t stackid); 150 static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp); 151 static int rawip_kstat_update(kstat_t *kp, int rw); 152 static void rawip_stack_shutdown(netstackid_t stackid, void *arg); 153 154 /* Common routines for TPI and socket module */ 155 static conn_t *rawip_do_open(int, cred_t *, int *, int); 156 static void rawip_do_close(conn_t *); 157 static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t); 158 static int rawip_do_unbind(conn_t *); 159 static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t, 160 cred_t *, pid_t); 161 162 int rawip_getsockname(sock_lower_handle_t, struct sockaddr *, 163 socklen_t *, cred_t *); 164 int rawip_getpeername(sock_lower_handle_t, struct sockaddr *, 165 socklen_t *, cred_t *); 166 167 static struct module_info icmp_mod_info = { 168 5707, "icmp", 1, INFPSZ, 512, 128 169 }; 170 171 /* 172 * Entry points for ICMP as a device. 173 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices. 174 */ 175 static struct qinit icmprinitv4 = { 176 NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info 177 }; 178 179 static struct qinit icmprinitv6 = { 180 NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info 181 }; 182 183 static struct qinit icmpwinit = { 184 (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info 185 }; 186 187 /* ICMP entry point during fallback */ 188 static struct qinit icmp_fallback_sock_winit = { 189 (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info 190 }; 191 192 /* For AF_INET aka /dev/icmp */ 193 struct streamtab icmpinfov4 = { 194 &icmprinitv4, &icmpwinit 195 }; 196 197 /* For AF_INET6 aka /dev/icmp6 */ 198 struct streamtab icmpinfov6 = { 199 &icmprinitv6, &icmpwinit 200 }; 201 202 /* Default structure copied into T_INFO_ACK messages */ 203 static struct T_info_ack icmp_g_t_info_ack = { 204 T_INFO_ACK, 205 IP_MAXPACKET, /* TSDU_size. icmp allows maximum size messages. */ 206 T_INVALID, /* ETSDU_size. icmp does not support expedited data. */ 207 T_INVALID, /* CDATA_size. icmp does not support connect data. */ 208 T_INVALID, /* DDATA_size. icmp does not support disconnect data. */ 209 0, /* ADDR_size - filled in later. */ 210 0, /* OPT_size - not initialized here */ 211 IP_MAXPACKET, /* TIDU_size. icmp allows maximum size messages. */ 212 T_CLTS, /* SERV_type. icmp supports connection-less. */ 213 TS_UNBND, /* CURRENT_state. This is set from icmp_state. */ 214 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 215 }; 216 217 /* 218 * All of these are alterable, within the min/max values given, at run time. 219 * 220 * Note: All those tunables which do not start with "icmp_" are Committed and 221 * therefore are public. See PSARC 2009/306. 222 */ 223 static mod_prop_info_t icmp_propinfo_tbl[] = { 224 /* tunable - 0 */ 225 { "icmp_wroff_extra", MOD_PROTO_RAWIP, 226 mod_set_uint32, mod_get_uint32, 227 {0, 128, 32}, {32} }, 228 229 { "icmp_ipv4_ttl", MOD_PROTO_RAWIP, 230 mod_set_uint32, mod_get_uint32, 231 {1, 255, 255}, {255} }, 232 233 { "icmp_ipv6_hoplimit", MOD_PROTO_RAWIP, 234 mod_set_uint32, mod_get_uint32, 235 {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS}, 236 {IPV6_DEFAULT_HOPS} }, 237 238 { "icmp_bsd_compat", MOD_PROTO_RAWIP, 239 mod_set_boolean, mod_get_boolean, 240 {B_TRUE}, {B_TRUE} }, 241 242 { "send_maxbuf", MOD_PROTO_RAWIP, 243 mod_set_uint32, mod_get_uint32, 244 {4096, 65536, 8192}, {8192} }, 245 246 { "icmp_xmit_lowat", MOD_PROTO_RAWIP, 247 mod_set_uint32, mod_get_uint32, 248 {0, 65536, 1024}, {1024} }, 249 250 { "recv_maxbuf", MOD_PROTO_RAWIP, 251 mod_set_uint32, mod_get_uint32, 252 {4096, 65536, 8192}, {8192} }, 253 254 { "icmp_max_buf", MOD_PROTO_RAWIP, 255 mod_set_uint32, mod_get_uint32, 256 {65536, 1024*1024*1024, 256*1024}, {256 * 1024} }, 257 258 { "icmp_pmtu_discovery", MOD_PROTO_RAWIP, 259 mod_set_boolean, mod_get_boolean, 260 {B_FALSE}, {B_FALSE} }, 261 262 { "icmp_sendto_ignerr", MOD_PROTO_RAWIP, 263 mod_set_boolean, mod_get_boolean, 264 {B_FALSE}, {B_FALSE} }, 265 266 { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} }, 267 268 { NULL, 0, NULL, NULL, {0}, {0} } 269 }; 270 271 #define is_wroff_extra is_propinfo_tbl[0].prop_cur_uval 272 #define is_ipv4_ttl is_propinfo_tbl[1].prop_cur_uval 273 #define is_ipv6_hoplimit is_propinfo_tbl[2].prop_cur_uval 274 #define is_bsd_compat is_propinfo_tbl[3].prop_cur_bval 275 #define is_xmit_hiwat is_propinfo_tbl[4].prop_cur_uval 276 #define is_xmit_lowat is_propinfo_tbl[5].prop_cur_uval 277 #define is_recv_hiwat is_propinfo_tbl[6].prop_cur_uval 278 #define is_max_buf is_propinfo_tbl[7].prop_cur_uval 279 #define is_pmtu_discovery is_propinfo_tbl[8].prop_cur_bval 280 #define is_sendto_ignerr is_propinfo_tbl[9].prop_cur_bval 281 282 typedef union T_primitives *t_primp_t; 283 284 /* 285 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 286 * passed to icmp_wput. 287 * It calls IP to verify the local IP address, and calls IP to insert 288 * the conn_t in the fanout table. 289 * If everything is ok it then sends the T_BIND_ACK back up. 290 */ 291 static void 292 icmp_tpi_bind(queue_t *q, mblk_t *mp) 293 { 294 int error; 295 struct sockaddr *sa; 296 struct T_bind_req *tbr; 297 socklen_t len; 298 sin_t *sin; 299 sin6_t *sin6; 300 icmp_t *icmp; 301 conn_t *connp = Q_TO_CONN(q); 302 mblk_t *mp1; 303 cred_t *cr; 304 305 /* 306 * All Solaris components should pass a db_credp 307 * for this TPI message, hence we ASSERT. 308 * But in case there is some other M_PROTO that looks 309 * like a TPI message sent by some other kernel 310 * component, we check and return an error. 311 */ 312 cr = msg_getcred(mp, NULL); 313 ASSERT(cr != NULL); 314 if (cr == NULL) { 315 icmp_err_ack(q, mp, TSYSERR, EINVAL); 316 return; 317 } 318 319 icmp = connp->conn_icmp; 320 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 321 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 322 "icmp_bind: bad req, len %u", 323 (uint_t)(mp->b_wptr - mp->b_rptr)); 324 icmp_err_ack(q, mp, TPROTO, 0); 325 return; 326 } 327 328 if (icmp->icmp_state != TS_UNBND) { 329 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 330 "icmp_bind: bad state, %u", icmp->icmp_state); 331 icmp_err_ack(q, mp, TOUTSTATE, 0); 332 return; 333 } 334 335 /* 336 * Reallocate the message to make sure we have enough room for an 337 * address. 338 */ 339 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 340 if (mp1 == NULL) { 341 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 342 return; 343 } 344 mp = mp1; 345 346 /* Reset the message type in preparation for shipping it back. */ 347 DB_TYPE(mp) = M_PCPROTO; 348 tbr = (struct T_bind_req *)mp->b_rptr; 349 len = tbr->ADDR_length; 350 switch (len) { 351 case 0: /* request for a generic port */ 352 tbr->ADDR_offset = sizeof (struct T_bind_req); 353 if (connp->conn_family == AF_INET) { 354 tbr->ADDR_length = sizeof (sin_t); 355 sin = (sin_t *)&tbr[1]; 356 *sin = sin_null; 357 sin->sin_family = AF_INET; 358 mp->b_wptr = (uchar_t *)&sin[1]; 359 sa = (struct sockaddr *)sin; 360 len = sizeof (sin_t); 361 } else { 362 ASSERT(connp->conn_family == AF_INET6); 363 tbr->ADDR_length = sizeof (sin6_t); 364 sin6 = (sin6_t *)&tbr[1]; 365 *sin6 = sin6_null; 366 sin6->sin6_family = AF_INET6; 367 mp->b_wptr = (uchar_t *)&sin6[1]; 368 sa = (struct sockaddr *)sin6; 369 len = sizeof (sin6_t); 370 } 371 break; 372 373 case sizeof (sin_t): /* Complete IPv4 address */ 374 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 375 sizeof (sin_t)); 376 break; 377 378 case sizeof (sin6_t): /* Complete IPv6 address */ 379 sa = (struct sockaddr *)mi_offset_param(mp, 380 tbr->ADDR_offset, sizeof (sin6_t)); 381 break; 382 383 default: 384 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 385 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 386 icmp_err_ack(q, mp, TBADADDR, 0); 387 return; 388 } 389 390 error = rawip_do_bind(connp, sa, len); 391 if (error != 0) { 392 if (error > 0) { 393 icmp_err_ack(q, mp, TSYSERR, error); 394 } else { 395 icmp_err_ack(q, mp, -error, 0); 396 } 397 } else { 398 tbr->PRIM_type = T_BIND_ACK; 399 qreply(q, mp); 400 } 401 } 402 403 static int 404 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 405 { 406 sin_t *sin; 407 sin6_t *sin6; 408 icmp_t *icmp = connp->conn_icmp; 409 int error = 0; 410 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 411 in_port_t lport; /* Network byte order */ 412 ipaddr_t v4src; /* Set if AF_INET */ 413 in6_addr_t v6src; 414 uint_t scopeid = 0; 415 zoneid_t zoneid = IPCL_ZONEID(connp); 416 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 417 418 if (sa == NULL || !OK_32PTR((char *)sa)) { 419 return (EINVAL); 420 } 421 422 switch (len) { 423 case sizeof (sin_t): /* Complete IPv4 address */ 424 sin = (sin_t *)sa; 425 if (sin->sin_family != AF_INET || 426 connp->conn_family != AF_INET) { 427 /* TSYSERR, EAFNOSUPPORT */ 428 return (EAFNOSUPPORT); 429 } 430 v4src = sin->sin_addr.s_addr; 431 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 432 if (v4src != INADDR_ANY) { 433 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 434 B_TRUE); 435 } 436 lport = sin->sin_port; 437 break; 438 case sizeof (sin6_t): /* Complete IPv6 address */ 439 sin6 = (sin6_t *)sa; 440 if (sin6->sin6_family != AF_INET6 || 441 connp->conn_family != AF_INET6) { 442 /* TSYSERR, EAFNOSUPPORT */ 443 return (EAFNOSUPPORT); 444 } 445 /* No support for mapped addresses on raw sockets */ 446 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 447 /* TSYSERR, EADDRNOTAVAIL */ 448 return (EADDRNOTAVAIL); 449 } 450 v6src = sin6->sin6_addr; 451 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 452 if (IN6_IS_ADDR_LINKSCOPE(&v6src)) 453 scopeid = sin6->sin6_scope_id; 454 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 455 B_TRUE, scopeid); 456 } 457 lport = sin6->sin6_port; 458 break; 459 460 default: 461 /* TBADADDR */ 462 return (EADDRNOTAVAIL); 463 } 464 465 /* Is the local address a valid unicast, multicast, or broadcast? */ 466 if (laddr_type == IPVL_BAD) 467 return (EADDRNOTAVAIL); 468 469 /* 470 * The state must be TS_UNBND. 471 */ 472 mutex_enter(&connp->conn_lock); 473 if (icmp->icmp_state != TS_UNBND) { 474 mutex_exit(&connp->conn_lock); 475 return (-TOUTSTATE); 476 } 477 478 /* 479 * Copy the source address into our icmp structure. This address 480 * may still be zero; if so, ip will fill in the correct address 481 * each time an outbound packet is passed to it. 482 * If we are binding to a broadcast or multicast address then 483 * we just set the conn_bound_addr since we don't want to use 484 * that as the source address when sending. 485 */ 486 connp->conn_bound_addr_v6 = v6src; 487 connp->conn_laddr_v6 = v6src; 488 if (scopeid != 0) { 489 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 490 connp->conn_ixa->ixa_scopeid = scopeid; 491 connp->conn_incoming_ifindex = scopeid; 492 } else { 493 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 494 connp->conn_incoming_ifindex = connp->conn_bound_if; 495 } 496 497 switch (laddr_type) { 498 case IPVL_UNICAST_UP: 499 case IPVL_UNICAST_DOWN: 500 connp->conn_saddr_v6 = v6src; 501 connp->conn_mcbc_bind = B_FALSE; 502 break; 503 case IPVL_MCAST: 504 case IPVL_BCAST: 505 /* ip_set_destination will pick a source address later */ 506 connp->conn_saddr_v6 = ipv6_all_zeros; 507 connp->conn_mcbc_bind = B_TRUE; 508 break; 509 } 510 511 /* Any errors after this point should use late_error */ 512 513 /* 514 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 515 * with IPPROTO_TCP. 516 */ 517 connp->conn_lport = lport; 518 connp->conn_fport = 0; 519 520 if (connp->conn_family == AF_INET) { 521 ASSERT(connp->conn_ipversion == IPV4_VERSION); 522 } else { 523 ASSERT(connp->conn_ipversion == IPV6_VERSION); 524 } 525 526 icmp->icmp_state = TS_IDLE; 527 528 /* 529 * We create an initial header template here to make a subsequent 530 * sendto have a starting point. Since conn_last_dst is zero the 531 * first sendto will always follow the 'dst changed' code path. 532 * Note that we defer massaging options and the related checksum 533 * adjustment until we have a destination address. 534 */ 535 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 536 &connp->conn_faddr_v6, connp->conn_flowinfo); 537 if (error != 0) { 538 mutex_exit(&connp->conn_lock); 539 goto late_error; 540 } 541 /* Just in case */ 542 connp->conn_faddr_v6 = ipv6_all_zeros; 543 connp->conn_v6lastdst = ipv6_all_zeros; 544 mutex_exit(&connp->conn_lock); 545 546 error = ip_laddr_fanout_insert(connp); 547 if (error != 0) 548 goto late_error; 549 550 /* Bind succeeded */ 551 return (0); 552 553 late_error: 554 mutex_enter(&connp->conn_lock); 555 connp->conn_saddr_v6 = ipv6_all_zeros; 556 connp->conn_bound_addr_v6 = ipv6_all_zeros; 557 connp->conn_laddr_v6 = ipv6_all_zeros; 558 if (scopeid != 0) { 559 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 560 connp->conn_incoming_ifindex = connp->conn_bound_if; 561 } 562 icmp->icmp_state = TS_UNBND; 563 connp->conn_v6lastdst = ipv6_all_zeros; 564 connp->conn_lport = 0; 565 566 /* Restore the header that was built above - different source address */ 567 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 568 &connp->conn_faddr_v6, connp->conn_flowinfo); 569 mutex_exit(&connp->conn_lock); 570 return (error); 571 } 572 573 /* 574 * Tell IP to just bind to the protocol. 575 */ 576 static void 577 icmp_bind_proto(icmp_t *icmp) 578 { 579 conn_t *connp = icmp->icmp_connp; 580 581 mutex_enter(&connp->conn_lock); 582 connp->conn_saddr_v6 = ipv6_all_zeros; 583 connp->conn_laddr_v6 = ipv6_all_zeros; 584 connp->conn_faddr_v6 = ipv6_all_zeros; 585 connp->conn_v6lastdst = ipv6_all_zeros; 586 mutex_exit(&connp->conn_lock); 587 588 (void) ip_laddr_fanout_insert(connp); 589 } 590 591 /* 592 * This routine handles each T_CONN_REQ message passed to icmp. It 593 * associates a default destination address with the stream. 594 * 595 * After various error checks are completed, icmp_connect() lays 596 * the target address and port into the composite header template. 597 * Then we ask IP for information, including a source address if we didn't 598 * already have one. Finally we send up the T_OK_ACK reply message. 599 */ 600 static void 601 icmp_tpi_connect(queue_t *q, mblk_t *mp) 602 { 603 conn_t *connp = Q_TO_CONN(q); 604 struct T_conn_req *tcr; 605 struct sockaddr *sa; 606 socklen_t len; 607 int error; 608 cred_t *cr; 609 pid_t pid; 610 /* 611 * All Solaris components should pass a db_credp 612 * for this TPI message, hence we ASSERT. 613 * But in case there is some other M_PROTO that looks 614 * like a TPI message sent by some other kernel 615 * component, we check and return an error. 616 */ 617 cr = msg_getcred(mp, &pid); 618 ASSERT(cr != NULL); 619 if (cr == NULL) { 620 icmp_err_ack(q, mp, TSYSERR, EINVAL); 621 return; 622 } 623 624 tcr = (struct T_conn_req *)mp->b_rptr; 625 /* Sanity checks */ 626 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 627 icmp_err_ack(q, mp, TPROTO, 0); 628 return; 629 } 630 631 if (tcr->OPT_length != 0) { 632 icmp_err_ack(q, mp, TBADOPT, 0); 633 return; 634 } 635 636 len = tcr->DEST_length; 637 638 switch (len) { 639 default: 640 icmp_err_ack(q, mp, TBADADDR, 0); 641 return; 642 case sizeof (sin_t): 643 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 644 sizeof (sin_t)); 645 break; 646 case sizeof (sin6_t): 647 sa = (struct sockaddr *)mi_offset_param(mp, 648 tcr->DEST_offset, sizeof (sin6_t)); 649 break; 650 } 651 652 error = proto_verify_ip_addr(connp->conn_family, sa, len); 653 if (error != 0) { 654 icmp_err_ack(q, mp, TSYSERR, error); 655 return; 656 } 657 658 error = rawip_do_connect(connp, sa, len, cr, pid); 659 if (error != 0) { 660 if (error < 0) { 661 icmp_err_ack(q, mp, -error, 0); 662 } else { 663 icmp_err_ack(q, mp, 0, error); 664 } 665 } else { 666 mblk_t *mp1; 667 668 /* 669 * We have to send a connection confirmation to 670 * keep TLI happy. 671 */ 672 if (connp->conn_family == AF_INET) { 673 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 674 sizeof (sin_t), NULL, 0); 675 } else { 676 ASSERT(connp->conn_family == AF_INET6); 677 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 678 sizeof (sin6_t), NULL, 0); 679 } 680 if (mp1 == NULL) { 681 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 682 return; 683 } 684 685 /* 686 * Send ok_ack for T_CONN_REQ 687 */ 688 mp = mi_tpi_ok_ack_alloc(mp); 689 if (mp == NULL) { 690 /* Unable to reuse the T_CONN_REQ for the ack. */ 691 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 692 return; 693 } 694 putnext(connp->conn_rq, mp); 695 putnext(connp->conn_rq, mp1); 696 } 697 } 698 699 static int 700 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 701 cred_t *cr, pid_t pid) 702 { 703 icmp_t *icmp; 704 sin_t *sin; 705 sin6_t *sin6; 706 int error; 707 uint16_t dstport; 708 ipaddr_t v4dst; 709 in6_addr_t v6dst; 710 uint32_t flowinfo; 711 ip_xmit_attr_t *ixa; 712 uint_t scopeid = 0; 713 uint_t srcid = 0; 714 in6_addr_t v6src = connp->conn_saddr_v6; 715 716 icmp = connp->conn_icmp; 717 718 if (sa == NULL || !OK_32PTR((char *)sa)) { 719 return (EINVAL); 720 } 721 722 ASSERT(sa != NULL && len != 0); 723 724 /* 725 * Determine packet type based on type of address passed in 726 * the request should contain an IPv4 or IPv6 address. 727 * Make sure that address family matches the type of 728 * family of the address passed down. 729 */ 730 switch (len) { 731 case sizeof (sin_t): 732 sin = (sin_t *)sa; 733 734 v4dst = sin->sin_addr.s_addr; 735 dstport = sin->sin_port; 736 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 737 ASSERT(connp->conn_ipversion == IPV4_VERSION); 738 break; 739 740 case sizeof (sin6_t): 741 sin6 = (sin6_t *)sa; 742 743 /* No support for mapped addresses on raw sockets */ 744 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 745 return (EADDRNOTAVAIL); 746 } 747 v6dst = sin6->sin6_addr; 748 dstport = sin6->sin6_port; 749 ASSERT(connp->conn_ipversion == IPV6_VERSION); 750 flowinfo = sin6->sin6_flowinfo; 751 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 752 scopeid = sin6->sin6_scope_id; 753 srcid = sin6->__sin6_src_id; 754 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 755 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 756 connp->conn_netstack); 757 } 758 break; 759 } 760 761 /* 762 * If there is a different thread using conn_ixa then we get a new 763 * copy and cut the old one loose from conn_ixa. Otherwise we use 764 * conn_ixa and prevent any other thread from using/changing it. 765 * Once connect() is done other threads can use conn_ixa since the 766 * refcnt will be back at one. 767 */ 768 ixa = conn_get_ixa(connp, B_TRUE); 769 if (ixa == NULL) 770 return (ENOMEM); 771 772 ASSERT(ixa->ixa_refcnt >= 2); 773 ASSERT(ixa == connp->conn_ixa); 774 775 mutex_enter(&connp->conn_lock); 776 /* 777 * This icmp_t must have bound already before doing a connect. 778 * Reject if a connect is in progress (we drop conn_lock during 779 * rawip_do_connect). 780 */ 781 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 782 mutex_exit(&connp->conn_lock); 783 ixa_refrele(ixa); 784 return (-TOUTSTATE); 785 } 786 787 if (icmp->icmp_state == TS_DATA_XFER) { 788 /* Already connected - clear out state */ 789 if (connp->conn_mcbc_bind) 790 connp->conn_saddr_v6 = ipv6_all_zeros; 791 else 792 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 793 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 794 connp->conn_faddr_v6 = ipv6_all_zeros; 795 icmp->icmp_state = TS_IDLE; 796 } 797 798 /* 799 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 800 * with IPPROTO_TCP. 801 */ 802 connp->conn_fport = dstport; 803 if (connp->conn_ipversion == IPV4_VERSION) { 804 /* 805 * Interpret a zero destination to mean loopback. 806 * Update the T_CONN_REQ (sin/sin6) since it is used to 807 * generate the T_CONN_CON. 808 */ 809 if (v4dst == INADDR_ANY) { 810 v4dst = htonl(INADDR_LOOPBACK); 811 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 812 ASSERT(connp->conn_family == AF_INET); 813 sin->sin_addr.s_addr = v4dst; 814 } 815 connp->conn_faddr_v6 = v6dst; 816 connp->conn_flowinfo = 0; 817 } else { 818 ASSERT(connp->conn_ipversion == IPV6_VERSION); 819 /* 820 * Interpret a zero destination to mean loopback. 821 * Update the T_CONN_REQ (sin/sin6) since it is used to 822 * generate the T_CONN_CON. 823 */ 824 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 825 v6dst = ipv6_loopback; 826 sin6->sin6_addr = v6dst; 827 } 828 connp->conn_faddr_v6 = v6dst; 829 connp->conn_flowinfo = flowinfo; 830 } 831 832 /* 833 * We update our cred/cpid based on the caller of connect 834 */ 835 if (connp->conn_cred != cr) { 836 crhold(cr); 837 crfree(connp->conn_cred); 838 connp->conn_cred = cr; 839 } 840 connp->conn_cpid = pid; 841 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 842 ixa->ixa_cred = cr; 843 ixa->ixa_cpid = pid; 844 if (is_system_labeled()) { 845 /* We need to restart with a label based on the cred */ 846 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 847 } 848 849 if (scopeid != 0) { 850 ixa->ixa_flags |= IXAF_SCOPEID_SET; 851 ixa->ixa_scopeid = scopeid; 852 connp->conn_incoming_ifindex = scopeid; 853 } else { 854 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 855 connp->conn_incoming_ifindex = connp->conn_bound_if; 856 } 857 858 /* 859 * conn_connect will drop conn_lock and reacquire it. 860 * To prevent a send* from messing with this icmp_t while the lock 861 * is dropped we set icmp_state and clear conn_v6lastdst. 862 * That will make all send* fail with EISCONN. 863 */ 864 connp->conn_v6lastdst = ipv6_all_zeros; 865 icmp->icmp_state = TS_WCON_CREQ; 866 867 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 868 mutex_exit(&connp->conn_lock); 869 if (error != 0) 870 goto connect_failed; 871 872 /* 873 * The addresses have been verified. Time to insert in 874 * the correct fanout list. 875 */ 876 error = ipcl_conn_insert(connp); 877 if (error != 0) 878 goto connect_failed; 879 880 mutex_enter(&connp->conn_lock); 881 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 882 &connp->conn_faddr_v6, connp->conn_flowinfo); 883 if (error != 0) { 884 mutex_exit(&connp->conn_lock); 885 goto connect_failed; 886 } 887 888 icmp->icmp_state = TS_DATA_XFER; 889 /* Record this as the "last" send even though we haven't sent any */ 890 connp->conn_v6lastdst = connp->conn_faddr_v6; 891 connp->conn_lastipversion = connp->conn_ipversion; 892 connp->conn_lastdstport = connp->conn_fport; 893 connp->conn_lastflowinfo = connp->conn_flowinfo; 894 connp->conn_lastscopeid = scopeid; 895 connp->conn_lastsrcid = srcid; 896 /* Also remember a source to use together with lastdst */ 897 connp->conn_v6lastsrc = v6src; 898 mutex_exit(&connp->conn_lock); 899 900 ixa_refrele(ixa); 901 return (0); 902 903 connect_failed: 904 if (ixa != NULL) 905 ixa_refrele(ixa); 906 mutex_enter(&connp->conn_lock); 907 icmp->icmp_state = TS_IDLE; 908 /* In case the source address was set above */ 909 if (connp->conn_mcbc_bind) 910 connp->conn_saddr_v6 = ipv6_all_zeros; 911 else 912 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 913 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 914 connp->conn_faddr_v6 = ipv6_all_zeros; 915 connp->conn_v6lastdst = ipv6_all_zeros; 916 connp->conn_flowinfo = 0; 917 918 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 919 &connp->conn_faddr_v6, connp->conn_flowinfo); 920 mutex_exit(&connp->conn_lock); 921 return (error); 922 } 923 924 static void 925 rawip_do_close(conn_t *connp) 926 { 927 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 928 929 ip_quiesce_conn(connp); 930 931 if (!IPCL_IS_NONSTR(connp)) { 932 qprocsoff(connp->conn_rq); 933 } 934 935 icmp_close_free(connp); 936 937 /* 938 * Now we are truly single threaded on this stream, and can 939 * delete the things hanging off the connp, and finally the connp. 940 * We removed this connp from the fanout list, it cannot be 941 * accessed thru the fanouts, and we already waited for the 942 * conn_ref to drop to 0. We are already in close, so 943 * there cannot be any other thread from the top. qprocsoff 944 * has completed, and service has completed or won't run in 945 * future. 946 */ 947 ASSERT(connp->conn_ref == 1); 948 949 if (!IPCL_IS_NONSTR(connp)) { 950 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 951 } else { 952 ip_free_helper_stream(connp); 953 } 954 955 connp->conn_ref--; 956 ipcl_conn_destroy(connp); 957 } 958 959 static int 960 icmp_close(queue_t *q, int flags) 961 { 962 conn_t *connp; 963 964 if (flags & SO_FALLBACK) { 965 /* 966 * stream is being closed while in fallback 967 * simply free the resources that were allocated 968 */ 969 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 970 qprocsoff(q); 971 goto done; 972 } 973 974 connp = Q_TO_CONN(q); 975 (void) rawip_do_close(connp); 976 done: 977 q->q_ptr = WR(q)->q_ptr = NULL; 978 return (0); 979 } 980 981 static void 982 icmp_close_free(conn_t *connp) 983 { 984 icmp_t *icmp = connp->conn_icmp; 985 986 if (icmp->icmp_filter != NULL) { 987 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 988 icmp->icmp_filter = NULL; 989 } 990 991 /* 992 * Clear any fields which the kmem_cache constructor clears. 993 * Only icmp_connp needs to be preserved. 994 * TBD: We should make this more efficient to avoid clearing 995 * everything. 996 */ 997 ASSERT(icmp->icmp_connp == connp); 998 bzero(icmp, sizeof (icmp_t)); 999 icmp->icmp_connp = connp; 1000 } 1001 1002 /* 1003 * This routine handles each T_DISCON_REQ message passed to icmp 1004 * as an indicating that ICMP is no longer connected. This results 1005 * in telling IP to restore the binding to just the local address. 1006 */ 1007 static int 1008 icmp_do_disconnect(conn_t *connp) 1009 { 1010 icmp_t *icmp = connp->conn_icmp; 1011 int error; 1012 1013 mutex_enter(&connp->conn_lock); 1014 if (icmp->icmp_state != TS_DATA_XFER) { 1015 mutex_exit(&connp->conn_lock); 1016 return (-TOUTSTATE); 1017 } 1018 if (connp->conn_mcbc_bind) 1019 connp->conn_saddr_v6 = ipv6_all_zeros; 1020 else 1021 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1022 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1023 connp->conn_faddr_v6 = ipv6_all_zeros; 1024 icmp->icmp_state = TS_IDLE; 1025 1026 connp->conn_v6lastdst = ipv6_all_zeros; 1027 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1028 &connp->conn_faddr_v6, connp->conn_flowinfo); 1029 mutex_exit(&connp->conn_lock); 1030 if (error != 0) 1031 return (error); 1032 1033 /* 1034 * Tell IP to remove the full binding and revert 1035 * to the local address binding. 1036 */ 1037 return (ip_laddr_fanout_insert(connp)); 1038 } 1039 1040 static void 1041 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1042 { 1043 conn_t *connp = Q_TO_CONN(q); 1044 int error; 1045 1046 /* 1047 * Allocate the largest primitive we need to send back 1048 * T_error_ack is > than T_ok_ack 1049 */ 1050 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1051 if (mp == NULL) { 1052 /* Unable to reuse the T_DISCON_REQ for the ack. */ 1053 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1054 return; 1055 } 1056 1057 error = icmp_do_disconnect(connp); 1058 1059 if (error != 0) { 1060 if (error > 0) { 1061 icmp_err_ack(q, mp, 0, error); 1062 } else { 1063 icmp_err_ack(q, mp, -error, 0); 1064 } 1065 } else { 1066 mp = mi_tpi_ok_ack_alloc(mp); 1067 ASSERT(mp != NULL); 1068 qreply(q, mp); 1069 } 1070 } 1071 1072 static int 1073 icmp_disconnect(conn_t *connp) 1074 { 1075 int error; 1076 1077 connp->conn_dgram_errind = B_FALSE; 1078 1079 error = icmp_do_disconnect(connp); 1080 1081 if (error < 0) 1082 error = proto_tlitosyserr(-error); 1083 return (error); 1084 } 1085 1086 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1087 static void 1088 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1089 { 1090 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1091 qreply(q, mp); 1092 } 1093 1094 /* Shorthand to generate and send TPI error acks to our client */ 1095 static void 1096 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1097 t_scalar_t t_error, int sys_error) 1098 { 1099 struct T_error_ack *teackp; 1100 1101 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1102 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1103 teackp = (struct T_error_ack *)mp->b_rptr; 1104 teackp->ERROR_prim = primitive; 1105 teackp->TLI_error = t_error; 1106 teackp->UNIX_error = sys_error; 1107 qreply(q, mp); 1108 } 1109 } 1110 1111 /* 1112 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1113 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1114 * Assumes that IP has pulled up everything up to and including the ICMP header. 1115 */ 1116 /* ARGSUSED2 */ 1117 static void 1118 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1119 { 1120 conn_t *connp = (conn_t *)arg1; 1121 icmp_t *icmp = connp->conn_icmp; 1122 icmph_t *icmph; 1123 ipha_t *ipha; 1124 int iph_hdr_length; 1125 sin_t sin; 1126 mblk_t *mp1; 1127 int error = 0; 1128 1129 ipha = (ipha_t *)mp->b_rptr; 1130 1131 ASSERT(OK_32PTR(mp->b_rptr)); 1132 1133 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1134 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1135 icmp_icmp_error_ipv6(connp, mp, ira); 1136 return; 1137 } 1138 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1139 1140 /* Skip past the outer IP and ICMP headers */ 1141 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1142 iph_hdr_length = ira->ira_ip_hdr_length; 1143 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1144 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1145 1146 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1147 1148 switch (icmph->icmph_type) { 1149 case ICMP_DEST_UNREACHABLE: 1150 switch (icmph->icmph_code) { 1151 case ICMP_FRAGMENTATION_NEEDED: { 1152 ipha_t *ipha; 1153 ip_xmit_attr_t *ixa; 1154 /* 1155 * IP has already adjusted the path MTU. 1156 * But we need to adjust DF for IPv4. 1157 */ 1158 if (connp->conn_ipversion != IPV4_VERSION) 1159 break; 1160 1161 ixa = conn_get_ixa(connp, B_FALSE); 1162 if (ixa == NULL || ixa->ixa_ire == NULL) { 1163 /* 1164 * Some other thread holds conn_ixa. We will 1165 * redo this on the next ICMP too big. 1166 */ 1167 if (ixa != NULL) 1168 ixa_refrele(ixa); 1169 break; 1170 } 1171 (void) ip_get_pmtu(ixa); 1172 1173 mutex_enter(&connp->conn_lock); 1174 ipha = (ipha_t *)connp->conn_ht_iphc; 1175 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1176 ipha->ipha_fragment_offset_and_flags |= 1177 IPH_DF_HTONS; 1178 } else { 1179 ipha->ipha_fragment_offset_and_flags &= 1180 ~IPH_DF_HTONS; 1181 } 1182 mutex_exit(&connp->conn_lock); 1183 ixa_refrele(ixa); 1184 break; 1185 } 1186 case ICMP_PORT_UNREACHABLE: 1187 case ICMP_PROTOCOL_UNREACHABLE: 1188 error = ECONNREFUSED; 1189 break; 1190 default: 1191 /* Transient errors */ 1192 break; 1193 } 1194 break; 1195 default: 1196 /* Transient errors */ 1197 break; 1198 } 1199 if (error == 0) { 1200 freemsg(mp); 1201 return; 1202 } 1203 1204 /* 1205 * Deliver T_UDERROR_IND when the application has asked for it. 1206 * The socket layer enables this automatically when connected. 1207 */ 1208 if (!connp->conn_dgram_errind) { 1209 freemsg(mp); 1210 return; 1211 } 1212 1213 sin = sin_null; 1214 sin.sin_family = AF_INET; 1215 sin.sin_addr.s_addr = ipha->ipha_dst; 1216 1217 if (IPCL_IS_NONSTR(connp)) { 1218 mutex_enter(&connp->conn_lock); 1219 if (icmp->icmp_state == TS_DATA_XFER) { 1220 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1221 mutex_exit(&connp->conn_lock); 1222 (*connp->conn_upcalls->su_set_error) 1223 (connp->conn_upper_handle, error); 1224 goto done; 1225 } 1226 } else { 1227 icmp->icmp_delayed_error = error; 1228 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1229 } 1230 mutex_exit(&connp->conn_lock); 1231 } else { 1232 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1233 error); 1234 if (mp1 != NULL) 1235 putnext(connp->conn_rq, mp1); 1236 } 1237 done: 1238 freemsg(mp); 1239 } 1240 1241 /* 1242 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1243 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1244 * Assumes that IP has pulled up all the extension headers as well as the 1245 * ICMPv6 header. 1246 */ 1247 static void 1248 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) 1249 { 1250 icmp6_t *icmp6; 1251 ip6_t *ip6h, *outer_ip6h; 1252 uint16_t iph_hdr_length; 1253 uint8_t *nexthdrp; 1254 sin6_t sin6; 1255 mblk_t *mp1; 1256 int error = 0; 1257 icmp_t *icmp = connp->conn_icmp; 1258 1259 outer_ip6h = (ip6_t *)mp->b_rptr; 1260 #ifdef DEBUG 1261 if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) 1262 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); 1263 else 1264 iph_hdr_length = IPV6_HDR_LEN; 1265 ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); 1266 #endif 1267 /* Skip past the outer IP and ICMP headers */ 1268 iph_hdr_length = ira->ira_ip_hdr_length; 1269 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 1270 1271 ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ 1272 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { 1273 freemsg(mp); 1274 return; 1275 } 1276 1277 switch (icmp6->icmp6_type) { 1278 case ICMP6_DST_UNREACH: 1279 switch (icmp6->icmp6_code) { 1280 case ICMP6_DST_UNREACH_NOPORT: 1281 error = ECONNREFUSED; 1282 break; 1283 case ICMP6_DST_UNREACH_ADMIN: 1284 case ICMP6_DST_UNREACH_NOROUTE: 1285 case ICMP6_DST_UNREACH_BEYONDSCOPE: 1286 case ICMP6_DST_UNREACH_ADDR: 1287 /* Transient errors */ 1288 break; 1289 default: 1290 break; 1291 } 1292 break; 1293 case ICMP6_PACKET_TOO_BIG: { 1294 struct T_unitdata_ind *tudi; 1295 struct T_opthdr *toh; 1296 size_t udi_size; 1297 mblk_t *newmp; 1298 t_scalar_t opt_length = sizeof (struct T_opthdr) + 1299 sizeof (struct ip6_mtuinfo); 1300 sin6_t *sin6; 1301 struct ip6_mtuinfo *mtuinfo; 1302 1303 /* 1304 * If the application has requested to receive path mtu 1305 * information, send up an empty message containing an 1306 * IPV6_PATHMTU ancillary data item. 1307 */ 1308 if (!connp->conn_ipv6_recvpathmtu) 1309 break; 1310 1311 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + 1312 opt_length; 1313 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) { 1314 BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors); 1315 break; 1316 } 1317 1318 /* 1319 * newmp->b_cont is left to NULL on purpose. This is an 1320 * empty message containing only ancillary data. 1321 */ 1322 newmp->b_datap->db_type = M_PROTO; 1323 tudi = (struct T_unitdata_ind *)newmp->b_rptr; 1324 newmp->b_wptr = (uchar_t *)tudi + udi_size; 1325 tudi->PRIM_type = T_UNITDATA_IND; 1326 tudi->SRC_length = sizeof (sin6_t); 1327 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 1328 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t); 1329 tudi->OPT_length = opt_length; 1330 1331 sin6 = (sin6_t *)&tudi[1]; 1332 bzero(sin6, sizeof (sin6_t)); 1333 sin6->sin6_family = AF_INET6; 1334 sin6->sin6_addr = connp->conn_faddr_v6; 1335 1336 toh = (struct T_opthdr *)&sin6[1]; 1337 toh->level = IPPROTO_IPV6; 1338 toh->name = IPV6_PATHMTU; 1339 toh->len = opt_length; 1340 toh->status = 0; 1341 1342 mtuinfo = (struct ip6_mtuinfo *)&toh[1]; 1343 bzero(mtuinfo, sizeof (struct ip6_mtuinfo)); 1344 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 1345 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst; 1346 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu; 1347 /* 1348 * We've consumed everything we need from the original 1349 * message. Free it, then send our empty message. 1350 */ 1351 freemsg(mp); 1352 icmp_ulp_recv(connp, newmp, msgdsize(newmp)); 1353 return; 1354 } 1355 case ICMP6_TIME_EXCEEDED: 1356 /* Transient errors */ 1357 break; 1358 case ICMP6_PARAM_PROB: 1359 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 1360 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 1361 (uchar_t *)ip6h + icmp6->icmp6_pptr == 1362 (uchar_t *)nexthdrp) { 1363 error = ECONNREFUSED; 1364 break; 1365 } 1366 break; 1367 } 1368 if (error == 0) { 1369 freemsg(mp); 1370 return; 1371 } 1372 1373 /* 1374 * Deliver T_UDERROR_IND when the application has asked for it. 1375 * The socket layer enables this automatically when connected. 1376 */ 1377 if (!connp->conn_dgram_errind) { 1378 freemsg(mp); 1379 return; 1380 } 1381 1382 sin6 = sin6_null; 1383 sin6.sin6_family = AF_INET6; 1384 sin6.sin6_addr = ip6h->ip6_dst; 1385 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 1386 if (IPCL_IS_NONSTR(connp)) { 1387 mutex_enter(&connp->conn_lock); 1388 if (icmp->icmp_state == TS_DATA_XFER) { 1389 if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, 1390 &connp->conn_faddr_v6)) { 1391 mutex_exit(&connp->conn_lock); 1392 (*connp->conn_upcalls->su_set_error) 1393 (connp->conn_upper_handle, error); 1394 goto done; 1395 } 1396 } else { 1397 icmp->icmp_delayed_error = error; 1398 *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; 1399 } 1400 mutex_exit(&connp->conn_lock); 1401 } else { 1402 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), 1403 NULL, 0, error); 1404 if (mp1 != NULL) 1405 putnext(connp->conn_rq, mp1); 1406 } 1407 done: 1408 freemsg(mp); 1409 } 1410 1411 /* 1412 * This routine responds to T_ADDR_REQ messages. It is called by icmp_wput. 1413 * The local address is filled in if endpoint is bound. The remote address 1414 * is filled in if remote address has been precified ("connected endpoint") 1415 * (The concept of connected CLTS sockets is alien to published TPI 1416 * but we support it anyway). 1417 */ 1418 static void 1419 icmp_addr_req(queue_t *q, mblk_t *mp) 1420 { 1421 struct sockaddr *sa; 1422 mblk_t *ackmp; 1423 struct T_addr_ack *taa; 1424 icmp_t *icmp = Q_TO_ICMP(q); 1425 conn_t *connp = icmp->icmp_connp; 1426 uint_t addrlen; 1427 1428 /* Make it large enough for worst case */ 1429 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1430 2 * sizeof (sin6_t), 1); 1431 if (ackmp == NULL) { 1432 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1433 return; 1434 } 1435 taa = (struct T_addr_ack *)ackmp->b_rptr; 1436 1437 bzero(taa, sizeof (struct T_addr_ack)); 1438 ackmp->b_wptr = (uchar_t *)&taa[1]; 1439 1440 taa->PRIM_type = T_ADDR_ACK; 1441 ackmp->b_datap->db_type = M_PCPROTO; 1442 1443 if (connp->conn_family == AF_INET) 1444 addrlen = sizeof (sin_t); 1445 else 1446 addrlen = sizeof (sin6_t); 1447 1448 mutex_enter(&connp->conn_lock); 1449 /* 1450 * Note: Following code assumes 32 bit alignment of basic 1451 * data structures like sin_t and struct T_addr_ack. 1452 */ 1453 if (icmp->icmp_state != TS_UNBND) { 1454 /* 1455 * Fill in local address first 1456 */ 1457 taa->LOCADDR_offset = sizeof (*taa); 1458 taa->LOCADDR_length = addrlen; 1459 sa = (struct sockaddr *)&taa[1]; 1460 (void) conn_getsockname(connp, sa, &addrlen); 1461 ackmp->b_wptr += addrlen; 1462 } 1463 if (icmp->icmp_state == TS_DATA_XFER) { 1464 /* 1465 * connected, fill remote address too 1466 */ 1467 taa->REMADDR_length = addrlen; 1468 /* assumed 32-bit alignment */ 1469 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1470 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1471 (void) conn_getpeername(connp, sa, &addrlen); 1472 ackmp->b_wptr += addrlen; 1473 } 1474 mutex_exit(&connp->conn_lock); 1475 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1476 qreply(q, ackmp); 1477 } 1478 1479 static void 1480 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1481 { 1482 conn_t *connp = icmp->icmp_connp; 1483 1484 *tap = icmp_g_t_info_ack; 1485 1486 if (connp->conn_family == AF_INET6) 1487 tap->ADDR_size = sizeof (sin6_t); 1488 else 1489 tap->ADDR_size = sizeof (sin_t); 1490 tap->CURRENT_state = icmp->icmp_state; 1491 tap->OPT_size = icmp_max_optsize; 1492 } 1493 1494 static void 1495 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1496 t_uscalar_t cap_bits1) 1497 { 1498 tcap->CAP_bits1 = 0; 1499 1500 if (cap_bits1 & TC1_INFO) { 1501 icmp_copy_info(&tcap->INFO_ack, icmp); 1502 tcap->CAP_bits1 |= TC1_INFO; 1503 } 1504 } 1505 1506 /* 1507 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1508 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1509 * icmp_g_t_info_ack. The current state of the stream is copied from 1510 * icmp_state. 1511 */ 1512 static void 1513 icmp_capability_req(queue_t *q, mblk_t *mp) 1514 { 1515 icmp_t *icmp = Q_TO_ICMP(q); 1516 t_uscalar_t cap_bits1; 1517 struct T_capability_ack *tcap; 1518 1519 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1520 1521 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1522 mp->b_datap->db_type, T_CAPABILITY_ACK); 1523 if (!mp) 1524 return; 1525 1526 tcap = (struct T_capability_ack *)mp->b_rptr; 1527 1528 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1529 1530 qreply(q, mp); 1531 } 1532 1533 /* 1534 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1535 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1536 * The current state of the stream is copied from icmp_state. 1537 */ 1538 static void 1539 icmp_info_req(queue_t *q, mblk_t *mp) 1540 { 1541 icmp_t *icmp = Q_TO_ICMP(q); 1542 1543 /* Create a T_INFO_ACK message. */ 1544 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1545 T_INFO_ACK); 1546 if (!mp) 1547 return; 1548 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1549 qreply(q, mp); 1550 } 1551 1552 static int 1553 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1554 int family) 1555 { 1556 conn_t *connp; 1557 dev_t conn_dev; 1558 int error; 1559 1560 /* If the stream is already open, return immediately. */ 1561 if (q->q_ptr != NULL) 1562 return (0); 1563 1564 if (sflag == MODOPEN) 1565 return (EINVAL); 1566 1567 /* 1568 * Since ICMP is not used so heavily, allocating from the small 1569 * arena should be sufficient. 1570 */ 1571 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1572 return (EBUSY); 1573 } 1574 1575 if (flag & SO_FALLBACK) { 1576 /* 1577 * Non streams socket needs a stream to fallback to 1578 */ 1579 RD(q)->q_ptr = (void *)conn_dev; 1580 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1581 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1582 qprocson(q); 1583 return (0); 1584 } 1585 1586 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1587 if (connp == NULL) { 1588 ASSERT(error != 0); 1589 inet_minor_free(ip_minor_arena_sa, connp->conn_dev); 1590 return (error); 1591 } 1592 1593 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1594 connp->conn_dev = conn_dev; 1595 connp->conn_minor_arena = ip_minor_arena_sa; 1596 1597 /* 1598 * Initialize the icmp_t structure for this stream. 1599 */ 1600 q->q_ptr = connp; 1601 WR(q)->q_ptr = connp; 1602 connp->conn_rq = q; 1603 connp->conn_wq = WR(q); 1604 1605 WR(q)->q_hiwat = connp->conn_sndbuf; 1606 WR(q)->q_lowat = connp->conn_sndlowat; 1607 1608 qprocson(q); 1609 1610 /* Set the Stream head write offset. */ 1611 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1612 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1613 1614 mutex_enter(&connp->conn_lock); 1615 connp->conn_state_flags &= ~CONN_INCIPIENT; 1616 mutex_exit(&connp->conn_lock); 1617 1618 icmp_bind_proto(connp->conn_icmp); 1619 1620 return (0); 1621 } 1622 1623 /* For /dev/icmp aka AF_INET open */ 1624 static int 1625 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1626 { 1627 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1628 } 1629 1630 /* For /dev/icmp6 aka AF_INET6 open */ 1631 static int 1632 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1633 { 1634 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1635 } 1636 1637 /* 1638 * This is the open routine for icmp. It allocates a icmp_t structure for 1639 * the stream and, on the first open of the module, creates an ND table. 1640 */ 1641 static conn_t * 1642 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1643 { 1644 icmp_t *icmp; 1645 conn_t *connp; 1646 zoneid_t zoneid; 1647 netstack_t *ns; 1648 icmp_stack_t *is; 1649 int len; 1650 boolean_t isv6 = B_FALSE; 1651 1652 *err = secpolicy_net_icmpaccess(credp); 1653 if (*err != 0) 1654 return (NULL); 1655 1656 if (family == AF_INET6) 1657 isv6 = B_TRUE; 1658 1659 ns = netstack_find_by_cred(credp); 1660 ASSERT(ns != NULL); 1661 is = ns->netstack_icmp; 1662 ASSERT(is != NULL); 1663 1664 /* 1665 * For exclusive stacks we set the zoneid to zero 1666 * to make ICMP operate as if in the global zone. 1667 */ 1668 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1669 zoneid = GLOBAL_ZONEID; 1670 else 1671 zoneid = crgetzoneid(credp); 1672 1673 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1674 1675 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1676 icmp = connp->conn_icmp; 1677 1678 /* 1679 * ipcl_conn_create did a netstack_hold. Undo the hold that was 1680 * done by netstack_find_by_cred() 1681 */ 1682 netstack_rele(ns); 1683 1684 /* 1685 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1686 * need to lock anything. 1687 */ 1688 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1689 ASSERT(connp->conn_icmp == icmp); 1690 ASSERT(icmp->icmp_connp == connp); 1691 1692 /* Set the initial state of the stream and the privilege status. */ 1693 icmp->icmp_state = TS_UNBND; 1694 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1695 if (isv6) { 1696 connp->conn_family = AF_INET6; 1697 connp->conn_ipversion = IPV6_VERSION; 1698 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1699 connp->conn_proto = IPPROTO_ICMPV6; 1700 /* May be changed by a SO_PROTOTYPE socket option. */ 1701 connp->conn_proto = IPPROTO_ICMPV6; 1702 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1703 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1704 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1705 len = sizeof (ip6_t); 1706 } else { 1707 connp->conn_family = AF_INET; 1708 connp->conn_ipversion = IPV4_VERSION; 1709 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1710 /* May be changed by a SO_PROTOTYPE socket option. */ 1711 connp->conn_proto = IPPROTO_ICMP; 1712 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1713 connp->conn_default_ttl = is->is_ipv4_ttl; 1714 len = sizeof (ipha_t); 1715 } 1716 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1717 1718 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1719 1720 /* 1721 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1722 * the checksum is provided in the pre-built packet. We clear 1723 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1724 * complete IP header and not to compute the transport checksum. 1725 */ 1726 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1727 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1728 connp->conn_ixa->ixa_zoneid = zoneid; 1729 1730 connp->conn_zoneid = zoneid; 1731 1732 /* 1733 * If the caller has the process-wide flag set, then default to MAC 1734 * exempt mode. This allows read-down to unlabeled hosts. 1735 */ 1736 if (getpflags(NET_MAC_AWARE, credp) != 0) 1737 connp->conn_mac_mode = CONN_MAC_AWARE; 1738 1739 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1740 1741 icmp->icmp_is = is; 1742 1743 connp->conn_rcvbuf = is->is_recv_hiwat; 1744 connp->conn_sndbuf = is->is_xmit_hiwat; 1745 connp->conn_sndlowat = is->is_xmit_lowat; 1746 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1747 1748 connp->conn_wroff = len + is->is_wroff_extra; 1749 connp->conn_so_type = SOCK_RAW; 1750 1751 connp->conn_recv = icmp_input; 1752 connp->conn_recvicmp = icmp_icmp_input; 1753 crhold(credp); 1754 connp->conn_cred = credp; 1755 connp->conn_cpid = curproc->p_pid; 1756 connp->conn_open_time = ddi_get_lbolt64(); 1757 /* Cache things in ixa without an extra refhold */ 1758 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1759 connp->conn_ixa->ixa_cred = connp->conn_cred; 1760 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1761 if (is_system_labeled()) 1762 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1763 1764 connp->conn_flow_cntrld = B_FALSE; 1765 1766 if (is->is_pmtu_discovery) 1767 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1768 1769 return (connp); 1770 } 1771 1772 /* 1773 * Which ICMP options OK to set through T_UNITDATA_REQ... 1774 */ 1775 /* ARGSUSED */ 1776 static boolean_t 1777 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1778 { 1779 return (B_TRUE); 1780 } 1781 1782 /* 1783 * This routine gets default values of certain options whose default 1784 * values are maintained by protcol specific code 1785 */ 1786 int 1787 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1788 { 1789 icmp_t *icmp = Q_TO_ICMP(q); 1790 icmp_stack_t *is = icmp->icmp_is; 1791 int *i1 = (int *)ptr; 1792 1793 switch (level) { 1794 case IPPROTO_IP: 1795 switch (name) { 1796 case IP_MULTICAST_TTL: 1797 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1798 return (sizeof (uchar_t)); 1799 case IP_MULTICAST_LOOP: 1800 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1801 return (sizeof (uchar_t)); 1802 } 1803 break; 1804 case IPPROTO_IPV6: 1805 switch (name) { 1806 case IPV6_MULTICAST_HOPS: 1807 *i1 = IP_DEFAULT_MULTICAST_TTL; 1808 return (sizeof (int)); 1809 case IPV6_MULTICAST_LOOP: 1810 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1811 return (sizeof (int)); 1812 case IPV6_UNICAST_HOPS: 1813 *i1 = is->is_ipv6_hoplimit; 1814 return (sizeof (int)); 1815 } 1816 break; 1817 case IPPROTO_ICMPV6: 1818 switch (name) { 1819 case ICMP6_FILTER: 1820 /* Make it look like "pass all" */ 1821 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1822 return (sizeof (icmp6_filter_t)); 1823 } 1824 break; 1825 } 1826 return (-1); 1827 } 1828 1829 /* 1830 * This routine retrieves the current status of socket options. 1831 * It returns the size of the option retrieved, or -1. 1832 */ 1833 int 1834 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 1835 { 1836 icmp_t *icmp = connp->conn_icmp; 1837 int *i1 = (int *)ptr; 1838 conn_opt_arg_t coas; 1839 int retval; 1840 1841 coas.coa_connp = connp; 1842 coas.coa_ixa = connp->conn_ixa; 1843 coas.coa_ipp = &connp->conn_xmit_ipp; 1844 coas.coa_ancillary = B_FALSE; 1845 coas.coa_changed = 0; 1846 1847 /* 1848 * We assume that the optcom framework has checked for the set 1849 * of levels and names that are supported, hence we don't worry 1850 * about rejecting based on that. 1851 * First check for ICMP specific handling, then pass to common routine. 1852 */ 1853 switch (level) { 1854 case IPPROTO_IP: 1855 /* 1856 * Only allow IPv4 option processing on IPv4 sockets. 1857 */ 1858 if (connp->conn_family != AF_INET) 1859 return (-1); 1860 1861 switch (name) { 1862 case IP_OPTIONS: 1863 case T_IP_OPTIONS: 1864 /* Options are passed up with each packet */ 1865 return (0); 1866 case IP_HDRINCL: 1867 mutex_enter(&connp->conn_lock); 1868 *i1 = (int)icmp->icmp_hdrincl; 1869 mutex_exit(&connp->conn_lock); 1870 return (sizeof (int)); 1871 } 1872 break; 1873 1874 case IPPROTO_IPV6: 1875 /* 1876 * Only allow IPv6 option processing on native IPv6 sockets. 1877 */ 1878 if (connp->conn_family != AF_INET6) 1879 return (-1); 1880 1881 switch (name) { 1882 case IPV6_CHECKSUM: 1883 /* 1884 * Return offset or -1 if no checksum offset. 1885 * Does not apply to IPPROTO_ICMPV6 1886 */ 1887 if (connp->conn_proto == IPPROTO_ICMPV6) 1888 return (-1); 1889 1890 mutex_enter(&connp->conn_lock); 1891 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) 1892 *i1 = connp->conn_ixa->ixa_raw_cksum_offset; 1893 else 1894 *i1 = -1; 1895 mutex_exit(&connp->conn_lock); 1896 return (sizeof (int)); 1897 } 1898 break; 1899 1900 case IPPROTO_ICMPV6: 1901 /* 1902 * Only allow IPv6 option processing on native IPv6 sockets. 1903 */ 1904 if (connp->conn_family != AF_INET6) 1905 return (-1); 1906 1907 if (connp->conn_proto != IPPROTO_ICMPV6) 1908 return (-1); 1909 1910 switch (name) { 1911 case ICMP6_FILTER: 1912 mutex_enter(&connp->conn_lock); 1913 if (icmp->icmp_filter == NULL) { 1914 /* Make it look like "pass all" */ 1915 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1916 } else { 1917 (void) bcopy(icmp->icmp_filter, ptr, 1918 sizeof (icmp6_filter_t)); 1919 } 1920 mutex_exit(&connp->conn_lock); 1921 return (sizeof (icmp6_filter_t)); 1922 } 1923 } 1924 mutex_enter(&connp->conn_lock); 1925 retval = conn_opt_get(&coas, level, name, ptr); 1926 mutex_exit(&connp->conn_lock); 1927 return (retval); 1928 } 1929 1930 /* 1931 * This routine retrieves the current status of socket options. 1932 * It returns the size of the option retrieved, or -1. 1933 */ 1934 int 1935 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 1936 { 1937 conn_t *connp = Q_TO_CONN(q); 1938 int err; 1939 1940 err = icmp_opt_get(connp, level, name, ptr); 1941 return (err); 1942 } 1943 1944 /* 1945 * This routine sets socket options. 1946 */ 1947 int 1948 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, 1949 uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) 1950 { 1951 conn_t *connp = coa->coa_connp; 1952 ip_xmit_attr_t *ixa = coa->coa_ixa; 1953 icmp_t *icmp = connp->conn_icmp; 1954 icmp_stack_t *is = icmp->icmp_is; 1955 int *i1 = (int *)invalp; 1956 boolean_t onoff = (*i1 == 0) ? 0 : 1; 1957 int error; 1958 1959 ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); 1960 1961 /* 1962 * For fixed length options, no sanity check 1963 * of passed in length is done. It is assumed *_optcom_req() 1964 * routines do the right thing. 1965 */ 1966 1967 switch (level) { 1968 case SOL_SOCKET: 1969 switch (name) { 1970 case SO_PROTOTYPE: 1971 if ((*i1 & 0xFF) != IPPROTO_ICMP && 1972 (*i1 & 0xFF) != IPPROTO_ICMPV6 && 1973 secpolicy_net_rawaccess(cr) != 0) { 1974 return (EACCES); 1975 } 1976 if (checkonly) 1977 break; 1978 1979 mutex_enter(&connp->conn_lock); 1980 connp->conn_proto = *i1 & 0xFF; 1981 ixa->ixa_protocol = connp->conn_proto; 1982 if ((connp->conn_proto == IPPROTO_RAW || 1983 connp->conn_proto == IPPROTO_IGMP) && 1984 connp->conn_family == AF_INET) { 1985 icmp->icmp_hdrincl = 1; 1986 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 1987 } else if (connp->conn_proto == IPPROTO_UDP || 1988 connp->conn_proto == IPPROTO_TCP || 1989 connp->conn_proto == IPPROTO_SCTP) { 1990 /* Used by test applications like psh */ 1991 icmp->icmp_hdrincl = 0; 1992 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 1993 } else { 1994 icmp->icmp_hdrincl = 0; 1995 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 1996 } 1997 1998 if (connp->conn_family == AF_INET6 && 1999 connp->conn_proto == IPPROTO_ICMPV6) { 2000 /* Set offset for icmp6_cksum */ 2001 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2002 ixa->ixa_raw_cksum_offset = 2; 2003 } 2004 if (icmp->icmp_filter != NULL && 2005 connp->conn_proto != IPPROTO_ICMPV6) { 2006 kmem_free(icmp->icmp_filter, 2007 sizeof (icmp6_filter_t)); 2008 icmp->icmp_filter = NULL; 2009 } 2010 mutex_exit(&connp->conn_lock); 2011 2012 coa->coa_changed |= COA_HEADER_CHANGED; 2013 /* 2014 * For SCTP, we don't use icmp_bind_proto() for 2015 * raw socket binding. 2016 */ 2017 if (connp->conn_proto == IPPROTO_SCTP) 2018 return (0); 2019 2020 coa->coa_changed |= COA_ICMP_BIND_NEEDED; 2021 return (0); 2022 2023 case SO_SNDBUF: 2024 if (*i1 > is->is_max_buf) { 2025 return (ENOBUFS); 2026 } 2027 break; 2028 case SO_RCVBUF: 2029 if (*i1 > is->is_max_buf) { 2030 return (ENOBUFS); 2031 } 2032 break; 2033 } 2034 break; 2035 2036 case IPPROTO_IP: 2037 /* 2038 * Only allow IPv4 option processing on IPv4 sockets. 2039 */ 2040 if (connp->conn_family != AF_INET) 2041 return (EINVAL); 2042 2043 switch (name) { 2044 case IP_HDRINCL: 2045 if (!checkonly) { 2046 mutex_enter(&connp->conn_lock); 2047 icmp->icmp_hdrincl = onoff; 2048 if (onoff) 2049 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2050 else 2051 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2052 mutex_exit(&connp->conn_lock); 2053 } 2054 break; 2055 } 2056 break; 2057 2058 case IPPROTO_IPV6: 2059 if (connp->conn_family != AF_INET6) 2060 return (EINVAL); 2061 2062 switch (name) { 2063 case IPV6_CHECKSUM: 2064 /* 2065 * Integer offset into the user data of where the 2066 * checksum is located. 2067 * Offset of -1 disables option. 2068 * Does not apply to IPPROTO_ICMPV6. 2069 */ 2070 if (connp->conn_proto == IPPROTO_ICMPV6 || 2071 coa->coa_ancillary) { 2072 return (EINVAL); 2073 } 2074 if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { 2075 /* Negative or not 16 bit aligned offset */ 2076 return (EINVAL); 2077 } 2078 if (checkonly) 2079 break; 2080 2081 mutex_enter(&connp->conn_lock); 2082 if (*i1 == -1) { 2083 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2084 ixa->ixa_raw_cksum_offset = 0; 2085 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2086 } else { 2087 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; 2088 ixa->ixa_raw_cksum_offset = *i1; 2089 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2090 } 2091 mutex_exit(&connp->conn_lock); 2092 break; 2093 } 2094 break; 2095 2096 case IPPROTO_ICMPV6: 2097 /* 2098 * Only allow IPv6 option processing on IPv6 sockets. 2099 */ 2100 if (connp->conn_family != AF_INET6) 2101 return (EINVAL); 2102 if (connp->conn_proto != IPPROTO_ICMPV6) 2103 return (EINVAL); 2104 2105 switch (name) { 2106 case ICMP6_FILTER: 2107 if (checkonly) 2108 break; 2109 2110 if ((inlen != 0) && 2111 (inlen != sizeof (icmp6_filter_t))) 2112 return (EINVAL); 2113 2114 mutex_enter(&connp->conn_lock); 2115 if (inlen == 0) { 2116 if (icmp->icmp_filter != NULL) { 2117 kmem_free(icmp->icmp_filter, 2118 sizeof (icmp6_filter_t)); 2119 icmp->icmp_filter = NULL; 2120 } 2121 } else { 2122 if (icmp->icmp_filter == NULL) { 2123 icmp->icmp_filter = kmem_alloc( 2124 sizeof (icmp6_filter_t), 2125 KM_NOSLEEP); 2126 if (icmp->icmp_filter == NULL) { 2127 mutex_exit(&connp->conn_lock); 2128 return (ENOBUFS); 2129 } 2130 } 2131 (void) bcopy(invalp, icmp->icmp_filter, inlen); 2132 } 2133 mutex_exit(&connp->conn_lock); 2134 break; 2135 } 2136 break; 2137 } 2138 error = conn_opt_set(coa, level, name, inlen, invalp, 2139 checkonly, cr); 2140 return (error); 2141 } 2142 2143 /* 2144 * This routine sets socket options. 2145 */ 2146 int 2147 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 2148 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2149 void *thisdg_attrs, cred_t *cr) 2150 { 2151 icmp_t *icmp = connp->conn_icmp; 2152 int err; 2153 conn_opt_arg_t coas, *coa; 2154 boolean_t checkonly; 2155 icmp_stack_t *is = icmp->icmp_is; 2156 2157 switch (optset_context) { 2158 case SETFN_OPTCOM_CHECKONLY: 2159 checkonly = B_TRUE; 2160 /* 2161 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 2162 * inlen != 0 implies value supplied and 2163 * we have to "pretend" to set it. 2164 * inlen == 0 implies that there is no 2165 * value part in T_CHECK request and just validation 2166 * done elsewhere should be enough, we just return here. 2167 */ 2168 if (inlen == 0) { 2169 *outlenp = 0; 2170 return (0); 2171 } 2172 break; 2173 case SETFN_OPTCOM_NEGOTIATE: 2174 checkonly = B_FALSE; 2175 break; 2176 case SETFN_UD_NEGOTIATE: 2177 case SETFN_CONN_NEGOTIATE: 2178 checkonly = B_FALSE; 2179 /* 2180 * Negotiating local and "association-related" options 2181 * through T_UNITDATA_REQ. 2182 * 2183 * Following routine can filter out ones we do not 2184 * want to be "set" this way. 2185 */ 2186 if (!icmp_opt_allow_udr_set(level, name)) { 2187 *outlenp = 0; 2188 return (EINVAL); 2189 } 2190 break; 2191 default: 2192 /* 2193 * We should never get here 2194 */ 2195 *outlenp = 0; 2196 return (EINVAL); 2197 } 2198 2199 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 2200 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 2201 2202 if (thisdg_attrs != NULL) { 2203 /* Options from T_UNITDATA_REQ */ 2204 coa = (conn_opt_arg_t *)thisdg_attrs; 2205 ASSERT(coa->coa_connp == connp); 2206 ASSERT(coa->coa_ixa != NULL); 2207 ASSERT(coa->coa_ipp != NULL); 2208 ASSERT(coa->coa_ancillary); 2209 } else { 2210 coa = &coas; 2211 coas.coa_connp = connp; 2212 /* Get a reference on conn_ixa to prevent concurrent mods */ 2213 coas.coa_ixa = conn_get_ixa(connp, B_TRUE); 2214 if (coas.coa_ixa == NULL) { 2215 *outlenp = 0; 2216 return (ENOMEM); 2217 } 2218 coas.coa_ipp = &connp->conn_xmit_ipp; 2219 coas.coa_ancillary = B_FALSE; 2220 coas.coa_changed = 0; 2221 } 2222 2223 err = icmp_do_opt_set(coa, level, name, inlen, invalp, 2224 cr, checkonly); 2225 if (err != 0) { 2226 errout: 2227 if (!coa->coa_ancillary) 2228 ixa_refrele(coa->coa_ixa); 2229 *outlenp = 0; 2230 return (err); 2231 } 2232 2233 /* 2234 * Common case of OK return with outval same as inval. 2235 */ 2236 if (invalp != outvalp) { 2237 /* don't trust bcopy for identical src/dst */ 2238 (void) bcopy(invalp, outvalp, inlen); 2239 } 2240 *outlenp = inlen; 2241 2242 /* 2243 * If this was not ancillary data, then we rebuild the headers, 2244 * update the IRE/NCE, and IPsec as needed. 2245 * Since the label depends on the destination we go through 2246 * ip_set_destination first. 2247 */ 2248 if (coa->coa_ancillary) { 2249 return (0); 2250 } 2251 2252 if (coa->coa_changed & COA_ROUTE_CHANGED) { 2253 in6_addr_t saddr, faddr, nexthop; 2254 in_port_t fport; 2255 2256 /* 2257 * We clear lastdst to make sure we pick up the change 2258 * next time sending. 2259 * If we are connected we re-cache the information. 2260 * We ignore errors to preserve BSD behavior. 2261 * Note that we don't redo IPsec policy lookup here 2262 * since the final destination (or source) didn't change. 2263 */ 2264 mutex_enter(&connp->conn_lock); 2265 connp->conn_v6lastdst = ipv6_all_zeros; 2266 2267 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, 2268 &connp->conn_faddr_v6, &nexthop); 2269 saddr = connp->conn_saddr_v6; 2270 faddr = connp->conn_faddr_v6; 2271 fport = connp->conn_fport; 2272 mutex_exit(&connp->conn_lock); 2273 2274 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && 2275 !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { 2276 (void) ip_attr_connect(connp, coa->coa_ixa, 2277 &saddr, &faddr, &nexthop, fport, NULL, NULL, 2278 IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 2279 } 2280 } 2281 2282 ixa_refrele(coa->coa_ixa); 2283 2284 if (coa->coa_changed & COA_HEADER_CHANGED) { 2285 /* 2286 * Rebuild the header template if we are connected. 2287 * Otherwise clear conn_v6lastdst so we rebuild the header 2288 * in the data path. 2289 */ 2290 mutex_enter(&connp->conn_lock); 2291 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 2292 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 2293 err = icmp_build_hdr_template(connp, 2294 &connp->conn_saddr_v6, &connp->conn_faddr_v6, 2295 connp->conn_flowinfo); 2296 if (err != 0) { 2297 mutex_exit(&connp->conn_lock); 2298 return (err); 2299 } 2300 } else { 2301 connp->conn_v6lastdst = ipv6_all_zeros; 2302 } 2303 mutex_exit(&connp->conn_lock); 2304 } 2305 if (coa->coa_changed & COA_RCVBUF_CHANGED) { 2306 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 2307 connp->conn_rcvbuf); 2308 } 2309 if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 2310 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 2311 } 2312 if (coa->coa_changed & COA_WROFF_CHANGED) { 2313 /* Increase wroff if needed */ 2314 uint_t wroff; 2315 2316 mutex_enter(&connp->conn_lock); 2317 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; 2318 if (wroff > connp->conn_wroff) { 2319 connp->conn_wroff = wroff; 2320 mutex_exit(&connp->conn_lock); 2321 (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); 2322 } else { 2323 mutex_exit(&connp->conn_lock); 2324 } 2325 } 2326 if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { 2327 icmp_bind_proto(icmp); 2328 } 2329 return (err); 2330 } 2331 2332 /* This routine sets socket options. */ 2333 int 2334 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 2335 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2336 void *thisdg_attrs, cred_t *cr) 2337 { 2338 conn_t *connp = Q_TO_CONN(q); 2339 int error; 2340 2341 error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, 2342 outlenp, outvalp, thisdg_attrs, cr); 2343 return (error); 2344 } 2345 2346 /* 2347 * Setup IP headers. 2348 * 2349 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, 2350 * but icmp_output_hdrincl restores ipha_protocol once we return. 2351 */ 2352 mblk_t * 2353 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2354 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2355 mblk_t *data_mp, int *errorp) 2356 { 2357 mblk_t *mp; 2358 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2359 uint_t data_len; 2360 uint32_t cksum; 2361 2362 data_len = msgdsize(data_mp); 2363 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2364 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2365 if (mp == NULL) { 2366 ASSERT(*errorp != 0); 2367 return (NULL); 2368 } 2369 2370 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2371 2372 /* 2373 * If there was a routing option/header then conn_prepend_hdr 2374 * has massaged it and placed the pseudo-header checksum difference 2375 * in the cksum argument. 2376 * 2377 * Prepare for ICMPv6 checksum done in IP. 2378 * 2379 * We make it easy for IP to include our pseudo header 2380 * by putting our length (and any routing header adjustment) 2381 * in the ICMPv6 checksum field. 2382 * The IP source, destination, and length have already been set by 2383 * conn_prepend_hdr. 2384 */ 2385 cksum += data_len; 2386 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2387 ASSERT(cksum < 0x10000); 2388 2389 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2390 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2391 2392 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2393 } else { 2394 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2395 uint_t cksum_offset = 0; 2396 2397 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2398 2399 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2400 if (connp->conn_proto == IPPROTO_ICMPV6) { 2401 cksum_offset = ixa->ixa_ip_hdr_length + 2402 offsetof(icmp6_t, icmp6_cksum); 2403 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2404 cksum_offset = ixa->ixa_ip_hdr_length + 2405 ixa->ixa_raw_cksum_offset; 2406 } 2407 } 2408 if (cksum_offset != 0) { 2409 uint16_t *ptr; 2410 2411 /* Make sure the checksum fits in the first mblk */ 2412 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2413 mblk_t *mp1; 2414 2415 mp1 = msgpullup(mp, 2416 cksum_offset + sizeof (short)); 2417 freemsg(mp); 2418 if (mp1 == NULL) { 2419 *errorp = ENOMEM; 2420 return (NULL); 2421 } 2422 mp = mp1; 2423 ip6h = (ip6_t *)mp->b_rptr; 2424 } 2425 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2426 *ptr = htons(cksum); 2427 } 2428 } 2429 2430 /* Note that we don't try to update wroff due to ancillary data */ 2431 return (mp); 2432 } 2433 2434 static int 2435 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2436 const in6_addr_t *v6dst, uint32_t flowinfo) 2437 { 2438 int error; 2439 2440 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2441 /* 2442 * We clear lastdst to make sure we don't use the lastdst path 2443 * next time sending since we might not have set v6dst yet. 2444 */ 2445 connp->conn_v6lastdst = ipv6_all_zeros; 2446 2447 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2448 if (error != 0) 2449 return (error); 2450 2451 /* 2452 * Any routing header/option has been massaged. The checksum difference 2453 * is stored in conn_sum. 2454 */ 2455 return (0); 2456 } 2457 2458 static mblk_t * 2459 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2460 { 2461 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2462 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2463 /* 2464 * fallback has started but messages have not been moved yet 2465 */ 2466 if (icmp->icmp_fallback_queue_head == NULL) { 2467 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2468 icmp->icmp_fallback_queue_head = mp; 2469 icmp->icmp_fallback_queue_tail = mp; 2470 } else { 2471 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2472 icmp->icmp_fallback_queue_tail->b_next = mp; 2473 icmp->icmp_fallback_queue_tail = mp; 2474 } 2475 return (NULL); 2476 } else { 2477 /* 2478 * Fallback completed, let the caller putnext() the mblk. 2479 */ 2480 return (mp); 2481 } 2482 } 2483 2484 /* 2485 * Deliver data to ULP. In case we have a socket, and it's falling back to 2486 * TPI, then we'll queue the mp for later processing. 2487 */ 2488 static void 2489 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2490 { 2491 if (IPCL_IS_NONSTR(connp)) { 2492 icmp_t *icmp = connp->conn_icmp; 2493 int error; 2494 2495 ASSERT(len == msgdsize(mp)); 2496 if ((*connp->conn_upcalls->su_recv) 2497 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2498 mutex_enter(&icmp->icmp_recv_lock); 2499 if (error == ENOSPC) { 2500 /* 2501 * let's confirm while holding the lock 2502 */ 2503 if ((*connp->conn_upcalls->su_recv) 2504 (connp->conn_upper_handle, NULL, 0, 0, 2505 &error, NULL) < 0) { 2506 ASSERT(error == ENOSPC); 2507 if (error == ENOSPC) { 2508 connp->conn_flow_cntrld = 2509 B_TRUE; 2510 } 2511 } 2512 mutex_exit(&icmp->icmp_recv_lock); 2513 } else { 2514 ASSERT(error == EOPNOTSUPP); 2515 mp = icmp_queue_fallback(icmp, mp); 2516 mutex_exit(&icmp->icmp_recv_lock); 2517 if (mp != NULL) 2518 putnext(connp->conn_rq, mp); 2519 } 2520 } 2521 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2522 } else { 2523 putnext(connp->conn_rq, mp); 2524 } 2525 } 2526 2527 /* 2528 * This is the inbound data path. 2529 * IP has already pulled up the IP headers and verified alignment 2530 * etc. 2531 */ 2532 /* ARGSUSED2 */ 2533 static void 2534 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2535 { 2536 conn_t *connp = (conn_t *)arg1; 2537 struct T_unitdata_ind *tudi; 2538 uchar_t *rptr; /* Pointer to IP header */ 2539 int ip_hdr_length; 2540 int udi_size; /* Size of T_unitdata_ind */ 2541 int pkt_len; 2542 icmp_t *icmp; 2543 ip_pkt_t ipps; 2544 ip6_t *ip6h; 2545 mblk_t *mp1; 2546 crb_t recv_ancillary; 2547 icmp_stack_t *is; 2548 sin_t *sin; 2549 sin6_t *sin6; 2550 ipha_t *ipha; 2551 2552 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2553 2554 icmp = connp->conn_icmp; 2555 is = icmp->icmp_is; 2556 rptr = mp->b_rptr; 2557 2558 ASSERT(DB_TYPE(mp) == M_DATA); 2559 ASSERT(OK_32PTR(rptr)); 2560 ASSERT(ira->ira_pktlen == msgdsize(mp)); 2561 pkt_len = ira->ira_pktlen; 2562 2563 /* 2564 * Get a snapshot of these and allow other threads to change 2565 * them after that. We need the same recv_ancillary when determining 2566 * the size as when adding the ancillary data items. 2567 */ 2568 mutex_enter(&connp->conn_lock); 2569 recv_ancillary = connp->conn_recv_ancillary; 2570 mutex_exit(&connp->conn_lock); 2571 2572 ip_hdr_length = ira->ira_ip_hdr_length; 2573 ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ 2574 2575 /* Initialize regardless of IP version */ 2576 ipps.ipp_fields = 0; 2577 2578 if (ira->ira_flags & IRAF_IS_IPV4) { 2579 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); 2580 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 2581 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); 2582 2583 ipha = (ipha_t *)mp->b_rptr; 2584 if (recv_ancillary.crb_all != 0) 2585 (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); 2586 2587 /* 2588 * BSD for some reason adjusts ipha_length to exclude the 2589 * IP header length. We do the same. 2590 */ 2591 if (is->is_bsd_compat) { 2592 ushort_t len; 2593 2594 len = ntohs(ipha->ipha_length); 2595 if (mp->b_datap->db_ref > 1) { 2596 /* 2597 * Allocate a new IP header so that we can 2598 * modify ipha_length. 2599 */ 2600 mblk_t *mp1; 2601 2602 mp1 = allocb(ip_hdr_length, BPRI_MED); 2603 if (mp1 == NULL) { 2604 freemsg(mp); 2605 BUMP_MIB(&is->is_rawip_mib, 2606 rawipInErrors); 2607 return; 2608 } 2609 bcopy(rptr, mp1->b_rptr, ip_hdr_length); 2610 mp->b_rptr = rptr + ip_hdr_length; 2611 rptr = mp1->b_rptr; 2612 ipha = (ipha_t *)rptr; 2613 mp1->b_cont = mp; 2614 mp1->b_wptr = rptr + ip_hdr_length; 2615 mp = mp1; 2616 } 2617 len -= ip_hdr_length; 2618 ipha->ipha_length = htons(len); 2619 } 2620 2621 /* 2622 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 2623 * sockets. This is ensured by icmp_bind and the IP fanout code. 2624 */ 2625 ASSERT(connp->conn_family == AF_INET); 2626 2627 /* 2628 * This is the inbound data path. Packets are passed upstream 2629 * as T_UNITDATA_IND messages with full IPv4 headers still 2630 * attached. 2631 */ 2632 2633 /* 2634 * Normally only send up the source address. 2635 * If any ancillary data items are wanted we add those. 2636 */ 2637 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 2638 if (recv_ancillary.crb_all != 0) { 2639 udi_size += conn_recvancillary_size(connp, 2640 recv_ancillary, ira, mp, &ipps); 2641 } 2642 2643 /* Allocate a message block for the T_UNITDATA_IND structure. */ 2644 mp1 = allocb(udi_size, BPRI_MED); 2645 if (mp1 == NULL) { 2646 freemsg(mp); 2647 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2648 return; 2649 } 2650 mp1->b_cont = mp; 2651 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2652 mp1->b_datap->db_type = M_PROTO; 2653 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2654 tudi->PRIM_type = T_UNITDATA_IND; 2655 tudi->SRC_length = sizeof (sin_t); 2656 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2657 sin = (sin_t *)&tudi[1]; 2658 *sin = sin_null; 2659 sin->sin_family = AF_INET; 2660 sin->sin_addr.s_addr = ipha->ipha_src; 2661 *(uint32_t *)&sin->sin_zero[0] = 0; 2662 *(uint32_t *)&sin->sin_zero[4] = 0; 2663 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + 2664 sizeof (sin_t); 2665 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 2666 tudi->OPT_length = udi_size; 2667 2668 /* 2669 * Add options if IP_RECVIF etc is set 2670 */ 2671 if (udi_size != 0) { 2672 conn_recvancillary_add(connp, recv_ancillary, ira, 2673 &ipps, (uchar_t *)&sin[1], udi_size); 2674 } 2675 goto deliver; 2676 } 2677 2678 ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); 2679 /* 2680 * IPv6 packets can only be received by applications 2681 * that are prepared to receive IPv6 addresses. 2682 * The IP fanout must ensure this. 2683 */ 2684 ASSERT(connp->conn_family == AF_INET6); 2685 2686 /* 2687 * Handle IPv6 packets. We don't pass up the IP headers with the 2688 * payload for IPv6. 2689 */ 2690 2691 ip6h = (ip6_t *)rptr; 2692 if (recv_ancillary.crb_all != 0) { 2693 /* 2694 * Call on ip_find_hdr_v6 which gets individual lenghts of 2695 * extension headers (and pointers to them). 2696 */ 2697 uint8_t nexthdr; 2698 2699 /* We don't care about the length or nextheader. */ 2700 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); 2701 2702 /* 2703 * We do not pass up hop-by-hop options or any other 2704 * extension header as part of the packet. Applications 2705 * that want to see them have to specify IPV6_RECV* socket 2706 * options. And conn_recvancillary_size/add explicitly 2707 * drops the TX option from IPV6_HOPOPTS as it does for UDP. 2708 * 2709 * If we had multilevel ICMP sockets, then we'd want to 2710 * modify conn_recvancillary_size/add to 2711 * allow the user to see the label. 2712 */ 2713 } 2714 2715 /* 2716 * Check a filter for ICMPv6 types if needed. 2717 * Verify raw checksums if needed. 2718 */ 2719 mutex_enter(&connp->conn_lock); 2720 if (icmp->icmp_filter != NULL) { 2721 int type; 2722 2723 /* Assumes that IP has done the pullupmsg */ 2724 type = mp->b_rptr[ip_hdr_length]; 2725 2726 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); 2727 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { 2728 mutex_exit(&connp->conn_lock); 2729 freemsg(mp); 2730 return; 2731 } 2732 } 2733 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2734 /* Checksum */ 2735 uint16_t *up; 2736 uint32_t sum; 2737 int remlen; 2738 2739 up = (uint16_t *)&ip6h->ip6_src; 2740 2741 remlen = msgdsize(mp) - ip_hdr_length; 2742 sum = htons(connp->conn_proto + remlen) 2743 + up[0] + up[1] + up[2] + up[3] 2744 + up[4] + up[5] + up[6] + up[7] 2745 + up[8] + up[9] + up[10] + up[11] 2746 + up[12] + up[13] + up[14] + up[15]; 2747 sum = (sum & 0xffff) + (sum >> 16); 2748 sum = IP_CSUM(mp, ip_hdr_length, sum); 2749 if (sum != 0) { 2750 /* IPv6 RAW checksum failed */ 2751 ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); 2752 mutex_exit(&connp->conn_lock); 2753 freemsg(mp); 2754 BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); 2755 return; 2756 } 2757 } 2758 mutex_exit(&connp->conn_lock); 2759 2760 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2761 2762 if (recv_ancillary.crb_all != 0) { 2763 udi_size += conn_recvancillary_size(connp, 2764 recv_ancillary, ira, mp, &ipps); 2765 } 2766 2767 mp1 = allocb(udi_size, BPRI_MED); 2768 if (mp1 == NULL) { 2769 freemsg(mp); 2770 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2771 return; 2772 } 2773 mp1->b_cont = mp; 2774 mp1->b_datap->db_type = M_PROTO; 2775 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2776 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2777 tudi->PRIM_type = T_UNITDATA_IND; 2778 tudi->SRC_length = sizeof (sin6_t); 2779 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2780 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2781 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); 2782 tudi->OPT_length = udi_size; 2783 sin6 = (sin6_t *)&tudi[1]; 2784 *sin6 = sin6_null; 2785 sin6->sin6_port = 0; 2786 sin6->sin6_family = AF_INET6; 2787 2788 sin6->sin6_addr = ip6h->ip6_src; 2789 /* No sin6_flowinfo per API */ 2790 sin6->sin6_flowinfo = 0; 2791 /* For link-scope pass up scope id */ 2792 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) 2793 sin6->sin6_scope_id = ira->ira_ruifindex; 2794 else 2795 sin6->sin6_scope_id = 0; 2796 sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 2797 IPCL_ZONEID(connp), is->is_netstack); 2798 2799 if (udi_size != 0) { 2800 conn_recvancillary_add(connp, recv_ancillary, ira, 2801 &ipps, (uchar_t *)&sin6[1], udi_size); 2802 } 2803 2804 /* Skip all the IPv6 headers per API */ 2805 mp->b_rptr += ip_hdr_length; 2806 pkt_len -= ip_hdr_length; 2807 2808 deliver: 2809 BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); 2810 icmp_ulp_recv(connp, mp1, pkt_len); 2811 } 2812 2813 /* 2814 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report 2815 * information that can be changing beneath us. 2816 */ 2817 mblk_t * 2818 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2819 { 2820 mblk_t *mpdata; 2821 struct opthdr *optp; 2822 conn_t *connp = Q_TO_CONN(q); 2823 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2824 mblk_t *mp2ctl; 2825 2826 /* 2827 * make a copy of the original message 2828 */ 2829 mp2ctl = copymsg(mpctl); 2830 2831 if (mpctl == NULL || 2832 (mpdata = mpctl->b_cont) == NULL) { 2833 freemsg(mpctl); 2834 freemsg(mp2ctl); 2835 return (0); 2836 } 2837 2838 /* fixed length structure for IPv4 and IPv6 counters */ 2839 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2840 optp->level = EXPER_RAWIP; 2841 optp->name = 0; 2842 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2843 sizeof (is->is_rawip_mib)); 2844 optp->len = msgdsize(mpdata); 2845 qreply(q, mpctl); 2846 2847 return (mp2ctl); 2848 } 2849 2850 /* 2851 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2852 * TODO: If this ever actually tries to set anything, it needs to be 2853 * to do the appropriate locking. 2854 */ 2855 /* ARGSUSED */ 2856 int 2857 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2858 uchar_t *ptr, int len) 2859 { 2860 switch (level) { 2861 case EXPER_RAWIP: 2862 return (0); 2863 default: 2864 return (1); 2865 } 2866 } 2867 2868 /* 2869 * This routine creates a T_UDERROR_IND message and passes it upstream. 2870 * The address and options are copied from the T_UNITDATA_REQ message 2871 * passed in mp. This message is freed. 2872 */ 2873 static void 2874 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2875 { 2876 struct T_unitdata_req *tudr; 2877 mblk_t *mp1; 2878 uchar_t *destaddr; 2879 t_scalar_t destlen; 2880 uchar_t *optaddr; 2881 t_scalar_t optlen; 2882 2883 if ((mp->b_wptr < mp->b_rptr) || 2884 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2885 goto done; 2886 } 2887 tudr = (struct T_unitdata_req *)mp->b_rptr; 2888 destaddr = mp->b_rptr + tudr->DEST_offset; 2889 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2890 destaddr + tudr->DEST_length < mp->b_rptr || 2891 destaddr + tudr->DEST_length > mp->b_wptr) { 2892 goto done; 2893 } 2894 optaddr = mp->b_rptr + tudr->OPT_offset; 2895 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2896 optaddr + tudr->OPT_length < mp->b_rptr || 2897 optaddr + tudr->OPT_length > mp->b_wptr) { 2898 goto done; 2899 } 2900 destlen = tudr->DEST_length; 2901 optlen = tudr->OPT_length; 2902 2903 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2904 (char *)optaddr, optlen, err); 2905 if (mp1 != NULL) 2906 qreply(q, mp1); 2907 2908 done: 2909 freemsg(mp); 2910 } 2911 2912 static int 2913 rawip_do_unbind(conn_t *connp) 2914 { 2915 icmp_t *icmp = connp->conn_icmp; 2916 2917 mutex_enter(&connp->conn_lock); 2918 /* If a bind has not been done, we can't unbind. */ 2919 if (icmp->icmp_state == TS_UNBND) { 2920 mutex_exit(&connp->conn_lock); 2921 return (-TOUTSTATE); 2922 } 2923 connp->conn_saddr_v6 = ipv6_all_zeros; 2924 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2925 connp->conn_laddr_v6 = ipv6_all_zeros; 2926 connp->conn_mcbc_bind = B_FALSE; 2927 connp->conn_lport = 0; 2928 connp->conn_fport = 0; 2929 /* In case we were also connected */ 2930 connp->conn_faddr_v6 = ipv6_all_zeros; 2931 connp->conn_v6lastdst = ipv6_all_zeros; 2932 2933 icmp->icmp_state = TS_UNBND; 2934 2935 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2936 &connp->conn_faddr_v6, connp->conn_flowinfo); 2937 mutex_exit(&connp->conn_lock); 2938 2939 ip_unbind(connp); 2940 return (0); 2941 } 2942 2943 /* 2944 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2945 * After some error checking, the message is passed downstream to ip. 2946 */ 2947 static void 2948 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2949 { 2950 conn_t *connp = Q_TO_CONN(q); 2951 int error; 2952 2953 ASSERT(mp->b_cont == NULL); 2954 error = rawip_do_unbind(connp); 2955 if (error) { 2956 if (error < 0) { 2957 icmp_err_ack(q, mp, -error, 0); 2958 } else { 2959 icmp_err_ack(q, mp, 0, error); 2960 } 2961 return; 2962 } 2963 2964 /* 2965 * Convert mp into a T_OK_ACK 2966 */ 2967 2968 mp = mi_tpi_ok_ack_alloc(mp); 2969 2970 /* 2971 * should not happen in practice... T_OK_ACK is smaller than the 2972 * original message. 2973 */ 2974 ASSERT(mp != NULL); 2975 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 2976 qreply(q, mp); 2977 } 2978 2979 /* 2980 * Process IPv4 packets that already include an IP header. 2981 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 2982 * IPPROTO_IGMP). 2983 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 2984 * 2985 * The packet is assumed to have a base (20 byte) IP header followed 2986 * by the upper-layer protocol. We include any IP_OPTIONS including a 2987 * CIPSO label but otherwise preserve the base IP header. 2988 */ 2989 static int 2990 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 2991 { 2992 icmp_t *icmp = connp->conn_icmp; 2993 icmp_stack_t *is = icmp->icmp_is; 2994 ipha_t iphas; 2995 ipha_t *ipha; 2996 int ip_hdr_length; 2997 int tp_hdr_len; 2998 ip_xmit_attr_t *ixa; 2999 ip_pkt_t *ipp; 3000 in6_addr_t v6src; 3001 in6_addr_t v6dst; 3002 in6_addr_t v6nexthop; 3003 int error; 3004 boolean_t do_ipsec; 3005 3006 /* 3007 * We need an exclusive copy of conn_ixa since the included IP 3008 * header could have any destination. 3009 * That copy has no pointers hence we 3010 * need to set them up once we've parsed the ancillary data. 3011 */ 3012 ixa = conn_get_ixa_exclusive(connp); 3013 if (ixa == NULL) { 3014 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3015 freemsg(mp); 3016 return (ENOMEM); 3017 } 3018 ASSERT(cr != NULL); 3019 /* 3020 * Caller has a reference on cr; from db_credp or because we 3021 * are running in process context. 3022 */ 3023 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3024 ixa->ixa_cred = cr; 3025 ixa->ixa_cpid = pid; 3026 if (is_system_labeled()) { 3027 /* We need to restart with a label based on the cred */ 3028 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3029 } 3030 3031 /* In case previous destination was multicast or multirt */ 3032 ip_attr_newdst(ixa); 3033 3034 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3035 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3036 if (ipp == NULL) { 3037 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3038 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3039 ixa->ixa_cpid = connp->conn_cpid; 3040 ixa_refrele(ixa); 3041 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3042 freemsg(mp); 3043 return (ENOMEM); 3044 } 3045 mutex_enter(&connp->conn_lock); 3046 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3047 mutex_exit(&connp->conn_lock); 3048 if (error != 0) { 3049 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3050 freemsg(mp); 3051 goto done; 3052 } 3053 3054 /* Sanity check length of packet */ 3055 ipha = (ipha_t *)mp->b_rptr; 3056 3057 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3058 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3059 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3060 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3061 freemsg(mp); 3062 goto done; 3063 } 3064 ipha = (ipha_t *)mp->b_rptr; 3065 } 3066 ipha->ipha_version_and_hdr_length = 3067 (IP_VERSION<<4) | (ip_hdr_length>>2); 3068 3069 /* 3070 * We set IXAF_DONTFRAG if the application set DF which makes 3071 * IP not fragment. 3072 */ 3073 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3074 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3075 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3076 else 3077 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3078 3079 /* Even for multicast and broadcast we honor the apps ttl */ 3080 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3081 3082 /* 3083 * No source verification for non-local addresses 3084 */ 3085 if (ipha->ipha_src != INADDR_ANY && 3086 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3087 is->is_netstack->netstack_ip, B_FALSE) 3088 != IPVL_UNICAST_UP) { 3089 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3090 } 3091 3092 if (ipha->ipha_dst == INADDR_ANY) 3093 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3094 3095 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3096 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3097 3098 /* Defer IPsec if it might need to look at ICMP type/code */ 3099 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3100 ixa->ixa_flags |= IXAF_IS_IPV4; 3101 3102 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3103 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3104 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3105 (do_ipsec ? IPDF_IPSEC : 0)); 3106 switch (error) { 3107 case 0: 3108 break; 3109 case EADDRNOTAVAIL: 3110 /* 3111 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3112 * Don't have the application see that errno 3113 */ 3114 error = ENETUNREACH; 3115 goto failed; 3116 case ENETDOWN: 3117 /* 3118 * Have !ipif_addr_ready address; drop packet silently 3119 * until we can get applications to not send until we 3120 * are ready. 3121 */ 3122 error = 0; 3123 goto failed; 3124 case EHOSTUNREACH: 3125 case ENETUNREACH: 3126 if (ixa->ixa_ire != NULL) { 3127 /* 3128 * Let conn_ip_output/ire_send_noroute return 3129 * the error and send any local ICMP error. 3130 */ 3131 error = 0; 3132 break; 3133 } 3134 /* FALLTHRU */ 3135 default: 3136 failed: 3137 freemsg(mp); 3138 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3139 goto done; 3140 } 3141 if (ipha->ipha_src == INADDR_ANY) 3142 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3143 3144 /* 3145 * We might be going to a different destination than last time, 3146 * thus check that TX allows the communication and compute any 3147 * needed label. 3148 * 3149 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3150 * don't have to worry about concurrent threads. 3151 */ 3152 if (is_system_labeled()) { 3153 /* 3154 * Check whether Trusted Solaris policy allows communication 3155 * with this host, and pretend that the destination is 3156 * unreachable if not. 3157 * Compute any needed label and place it in ipp_label_v4/v6. 3158 * 3159 * Later conn_build_hdr_template/conn_prepend_hdr takes 3160 * ipp_label_v4/v6 to form the packet. 3161 * 3162 * Tsol note: We have ipp structure local to this thread so 3163 * no locking is needed. 3164 */ 3165 error = conn_update_label(connp, ixa, &v6dst, ipp); 3166 if (error != 0) { 3167 freemsg(mp); 3168 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3169 goto done; 3170 } 3171 } 3172 3173 /* 3174 * Save away a copy of the IPv4 header the application passed down 3175 * and then prepend an IPv4 header complete with any IP options 3176 * including label. 3177 * We need a struct copy since icmp_prepend_hdr will reuse the available 3178 * space in the mblk. 3179 */ 3180 iphas = *ipha; 3181 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3182 3183 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3184 if (mp == NULL) { 3185 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3186 ASSERT(error != 0); 3187 goto done; 3188 } 3189 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3190 error = EMSGSIZE; 3191 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3192 freemsg(mp); 3193 goto done; 3194 } 3195 /* Restore key parts of the header that the application passed down */ 3196 ipha = (ipha_t *)mp->b_rptr; 3197 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3198 ipha->ipha_ident = iphas.ipha_ident; 3199 ipha->ipha_fragment_offset_and_flags = 3200 iphas.ipha_fragment_offset_and_flags; 3201 ipha->ipha_ttl = iphas.ipha_ttl; 3202 ipha->ipha_protocol = iphas.ipha_protocol; 3203 ipha->ipha_src = iphas.ipha_src; 3204 ipha->ipha_dst = iphas.ipha_dst; 3205 3206 ixa->ixa_protocol = ipha->ipha_protocol; 3207 3208 /* 3209 * Make sure that the IP header plus any transport header that is 3210 * checksumed by ip_output is in the first mblk. (ip_output assumes 3211 * that at least the checksum field is in the first mblk.) 3212 */ 3213 switch (ipha->ipha_protocol) { 3214 case IPPROTO_UDP: 3215 tp_hdr_len = 8; 3216 break; 3217 case IPPROTO_TCP: 3218 tp_hdr_len = 20; 3219 break; 3220 default: 3221 tp_hdr_len = 0; 3222 break; 3223 } 3224 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3225 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3226 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3227 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3228 if (mp->b_cont == NULL) 3229 error = EINVAL; 3230 else 3231 error = ENOMEM; 3232 freemsg(mp); 3233 goto done; 3234 } 3235 } 3236 3237 if (!do_ipsec) { 3238 /* Policy might differ for different ICMP type/code */ 3239 if (ixa->ixa_ipsec_policy != NULL) { 3240 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3241 ixa->ixa_ipsec_policy = NULL; 3242 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3243 } 3244 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3245 if (mp == NULL) { 3246 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3247 error = EHOSTUNREACH; /* IPsec policy failure */ 3248 goto done; 3249 } 3250 } 3251 3252 /* We're done. Pass the packet to ip. */ 3253 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3254 3255 error = conn_ip_output(mp, ixa); 3256 /* No rawipOutErrors if an error since IP increases its error counter */ 3257 switch (error) { 3258 case 0: 3259 break; 3260 case EWOULDBLOCK: 3261 (void) ixa_check_drain_insert(connp, ixa); 3262 error = 0; 3263 break; 3264 case EADDRNOTAVAIL: 3265 /* 3266 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3267 * Don't have the application see that errno 3268 */ 3269 error = ENETUNREACH; 3270 break; 3271 } 3272 done: 3273 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3274 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3275 ixa->ixa_cpid = connp->conn_cpid; 3276 ixa_refrele(ixa); 3277 ip_pkt_free(ipp); 3278 kmem_free(ipp, sizeof (*ipp)); 3279 return (error); 3280 } 3281 3282 static mblk_t * 3283 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3284 { 3285 ipha_t *ipha = NULL; 3286 ip6_t *ip6h = NULL; 3287 3288 if (ixa->ixa_flags & IXAF_IS_IPV4) 3289 ipha = (ipha_t *)mp->b_rptr; 3290 else 3291 ip6h = (ip6_t *)mp->b_rptr; 3292 3293 if (ixa->ixa_ipsec_policy != NULL) { 3294 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3295 ixa->ixa_ipsec_policy = NULL; 3296 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3297 } 3298 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3299 } 3300 3301 /* 3302 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3303 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3304 * the TPI options, otherwise we take them from msg_control. 3305 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3306 * Always consumes mp; never consumes tudr_mp. 3307 */ 3308 static int 3309 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3310 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3311 { 3312 icmp_t *icmp = connp->conn_icmp; 3313 icmp_stack_t *is = icmp->icmp_is; 3314 int error; 3315 ip_xmit_attr_t *ixa; 3316 ip_pkt_t *ipp; 3317 in6_addr_t v6src; 3318 in6_addr_t v6dst; 3319 in6_addr_t v6nexthop; 3320 in_port_t dstport; 3321 uint32_t flowinfo; 3322 uint_t srcid; 3323 int is_absreq_failure = 0; 3324 conn_opt_arg_t coas, *coa; 3325 3326 ASSERT(tudr_mp != NULL || msg != NULL); 3327 3328 /* 3329 * Get ixa before checking state to handle a disconnect race. 3330 * 3331 * We need an exclusive copy of conn_ixa since the ancillary data 3332 * options might modify it. That copy has no pointers hence we 3333 * need to set them up once we've parsed the ancillary data. 3334 */ 3335 ixa = conn_get_ixa_exclusive(connp); 3336 if (ixa == NULL) { 3337 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3338 freemsg(mp); 3339 return (ENOMEM); 3340 } 3341 ASSERT(cr != NULL); 3342 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3343 ixa->ixa_cred = cr; 3344 ixa->ixa_cpid = pid; 3345 if (is_system_labeled()) { 3346 /* We need to restart with a label based on the cred */ 3347 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3348 } 3349 3350 /* In case previous destination was multicast or multirt */ 3351 ip_attr_newdst(ixa); 3352 3353 /* Get a copy of conn_xmit_ipp since the options might change it */ 3354 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3355 if (ipp == NULL) { 3356 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3357 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3358 ixa->ixa_cpid = connp->conn_cpid; 3359 ixa_refrele(ixa); 3360 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3361 freemsg(mp); 3362 return (ENOMEM); 3363 } 3364 mutex_enter(&connp->conn_lock); 3365 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3366 mutex_exit(&connp->conn_lock); 3367 if (error != 0) { 3368 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3369 freemsg(mp); 3370 goto done; 3371 } 3372 3373 /* 3374 * Parse the options and update ixa and ipp as a result. 3375 */ 3376 3377 coa = &coas; 3378 coa->coa_connp = connp; 3379 coa->coa_ixa = ixa; 3380 coa->coa_ipp = ipp; 3381 coa->coa_ancillary = B_TRUE; 3382 coa->coa_changed = 0; 3383 3384 if (msg != NULL) { 3385 error = process_auxiliary_options(connp, msg->msg_control, 3386 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3387 } else { 3388 struct T_unitdata_req *tudr; 3389 3390 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3391 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3392 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3393 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3394 coa, &is_absreq_failure); 3395 } 3396 if (error != 0) { 3397 /* 3398 * Note: No special action needed in this 3399 * module for "is_absreq_failure" 3400 */ 3401 freemsg(mp); 3402 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3403 goto done; 3404 } 3405 ASSERT(is_absreq_failure == 0); 3406 3407 mutex_enter(&connp->conn_lock); 3408 /* 3409 * If laddr is unspecified then we look at sin6_src_id. 3410 * We will give precedence to a source address set with IPV6_PKTINFO 3411 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3412 * want ip_attr_connect to select a source (since it can fail) when 3413 * IPV6_PKTINFO is specified. 3414 * If this doesn't result in a source address then we get a source 3415 * from ip_attr_connect() below. 3416 */ 3417 v6src = connp->conn_saddr_v6; 3418 if (sin != NULL) { 3419 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3420 dstport = sin->sin_port; 3421 flowinfo = 0; 3422 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3423 ixa->ixa_flags |= IXAF_IS_IPV4; 3424 } else if (sin6 != NULL) { 3425 v6dst = sin6->sin6_addr; 3426 dstport = sin6->sin6_port; 3427 flowinfo = sin6->sin6_flowinfo; 3428 srcid = sin6->__sin6_src_id; 3429 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3430 ixa->ixa_scopeid = sin6->sin6_scope_id; 3431 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3432 } else { 3433 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3434 } 3435 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3436 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3437 connp->conn_netstack); 3438 } 3439 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 3440 ixa->ixa_flags |= IXAF_IS_IPV4; 3441 else 3442 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3443 } else { 3444 /* Connected case */ 3445 v6dst = connp->conn_faddr_v6; 3446 flowinfo = connp->conn_flowinfo; 3447 } 3448 mutex_exit(&connp->conn_lock); 3449 /* Handle IPV6_PKTINFO setting source address. */ 3450 if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && 3451 (ipp->ipp_fields & IPPF_ADDR)) { 3452 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3453 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3454 v6src = ipp->ipp_addr; 3455 } else { 3456 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3457 v6src = ipp->ipp_addr; 3458 } 3459 } 3460 /* 3461 * Allow source not assigned to the system 3462 * only if it is not a local addresses 3463 */ 3464 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3465 ip_laddr_t laddr_type; 3466 3467 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3468 ipaddr_t v4src; 3469 3470 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3471 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3472 is->is_netstack->netstack_ip, B_FALSE); 3473 } else { 3474 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3475 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3476 } 3477 if (laddr_type != IPVL_UNICAST_UP) 3478 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3479 } 3480 3481 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3482 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3483 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3484 3485 switch (error) { 3486 case 0: 3487 break; 3488 case EADDRNOTAVAIL: 3489 /* 3490 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3491 * Don't have the application see that errno 3492 */ 3493 error = ENETUNREACH; 3494 goto failed; 3495 case ENETDOWN: 3496 /* 3497 * Have !ipif_addr_ready address; drop packet silently 3498 * until we can get applications to not send until we 3499 * are ready. 3500 */ 3501 error = 0; 3502 goto failed; 3503 case EHOSTUNREACH: 3504 case ENETUNREACH: 3505 if (ixa->ixa_ire != NULL) { 3506 /* 3507 * Let conn_ip_output/ire_send_noroute return 3508 * the error and send any local ICMP error. 3509 */ 3510 error = 0; 3511 break; 3512 } 3513 /* FALLTHRU */ 3514 default: 3515 failed: 3516 freemsg(mp); 3517 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3518 goto done; 3519 } 3520 3521 /* 3522 * We might be going to a different destination than last time, 3523 * thus check that TX allows the communication and compute any 3524 * needed label. 3525 * 3526 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3527 * don't have to worry about concurrent threads. 3528 */ 3529 if (is_system_labeled()) { 3530 /* 3531 * Check whether Trusted Solaris policy allows communication 3532 * with this host, and pretend that the destination is 3533 * unreachable if not. 3534 * Compute any needed label and place it in ipp_label_v4/v6. 3535 * 3536 * Later conn_build_hdr_template/conn_prepend_hdr takes 3537 * ipp_label_v4/v6 to form the packet. 3538 * 3539 * Tsol note: We have ipp structure local to this thread so 3540 * no locking is needed. 3541 */ 3542 error = conn_update_label(connp, ixa, &v6dst, ipp); 3543 if (error != 0) { 3544 freemsg(mp); 3545 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3546 goto done; 3547 } 3548 } 3549 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3550 &error); 3551 if (mp == NULL) { 3552 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3553 ASSERT(error != 0); 3554 goto done; 3555 } 3556 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3557 error = EMSGSIZE; 3558 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3559 freemsg(mp); 3560 goto done; 3561 } 3562 3563 /* Policy might differ for different ICMP type/code */ 3564 mp = icmp_output_attach_policy(mp, connp, ixa); 3565 if (mp == NULL) { 3566 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3567 error = EHOSTUNREACH; /* IPsec policy failure */ 3568 goto done; 3569 } 3570 3571 /* We're done. Pass the packet to ip. */ 3572 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3573 3574 error = conn_ip_output(mp, ixa); 3575 if (!connp->conn_unspec_src) 3576 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3577 /* No rawipOutErrors if an error since IP increases its error counter */ 3578 switch (error) { 3579 case 0: 3580 break; 3581 case EWOULDBLOCK: 3582 (void) ixa_check_drain_insert(connp, ixa); 3583 error = 0; 3584 break; 3585 case EADDRNOTAVAIL: 3586 /* 3587 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3588 * Don't have the application see that errno 3589 */ 3590 error = ENETUNREACH; 3591 /* FALLTHRU */ 3592 default: 3593 mutex_enter(&connp->conn_lock); 3594 /* 3595 * Clear the source and v6lastdst so we call ip_attr_connect 3596 * for the next packet and try to pick a better source. 3597 */ 3598 if (connp->conn_mcbc_bind) 3599 connp->conn_saddr_v6 = ipv6_all_zeros; 3600 else 3601 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3602 connp->conn_v6lastdst = ipv6_all_zeros; 3603 mutex_exit(&connp->conn_lock); 3604 break; 3605 } 3606 done: 3607 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3608 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3609 ixa->ixa_cpid = connp->conn_cpid; 3610 ixa_refrele(ixa); 3611 ip_pkt_free(ipp); 3612 kmem_free(ipp, sizeof (*ipp)); 3613 return (error); 3614 } 3615 3616 /* 3617 * Handle sending an M_DATA for a connected socket. 3618 * Handles both IPv4 and IPv6. 3619 */ 3620 int 3621 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3622 { 3623 icmp_t *icmp = connp->conn_icmp; 3624 icmp_stack_t *is = icmp->icmp_is; 3625 int error; 3626 ip_xmit_attr_t *ixa; 3627 boolean_t do_ipsec; 3628 3629 /* 3630 * If no other thread is using conn_ixa this just gets a reference to 3631 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 3632 */ 3633 ixa = conn_get_ixa(connp, B_FALSE); 3634 if (ixa == NULL) { 3635 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3636 freemsg(mp); 3637 return (ENOMEM); 3638 } 3639 3640 ASSERT(cr != NULL); 3641 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3642 ixa->ixa_cred = cr; 3643 ixa->ixa_cpid = pid; 3644 3645 /* Defer IPsec if it might need to look at ICMP type/code */ 3646 switch (ixa->ixa_protocol) { 3647 case IPPROTO_ICMP: 3648 case IPPROTO_ICMPV6: 3649 do_ipsec = B_FALSE; 3650 break; 3651 default: 3652 do_ipsec = B_TRUE; 3653 } 3654 3655 mutex_enter(&connp->conn_lock); 3656 mp = icmp_prepend_header_template(connp, ixa, mp, 3657 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3658 3659 if (mp == NULL) { 3660 ASSERT(error != 0); 3661 mutex_exit(&connp->conn_lock); 3662 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3663 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3664 ixa->ixa_cpid = connp->conn_cpid; 3665 ixa_refrele(ixa); 3666 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3667 freemsg(mp); 3668 return (error); 3669 } 3670 3671 if (!do_ipsec) { 3672 /* Policy might differ for different ICMP type/code */ 3673 mp = icmp_output_attach_policy(mp, connp, ixa); 3674 if (mp == NULL) { 3675 mutex_exit(&connp->conn_lock); 3676 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3677 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3678 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3679 ixa->ixa_cpid = connp->conn_cpid; 3680 ixa_refrele(ixa); 3681 return (EHOSTUNREACH); /* IPsec policy failure */ 3682 } 3683 } 3684 3685 /* 3686 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3687 * safe copy, then we need to fill in any pointers in it. 3688 */ 3689 if (ixa->ixa_ire == NULL) { 3690 in6_addr_t faddr, saddr; 3691 in6_addr_t nexthop; 3692 in_port_t fport; 3693 3694 saddr = connp->conn_saddr_v6; 3695 faddr = connp->conn_faddr_v6; 3696 fport = connp->conn_fport; 3697 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3698 mutex_exit(&connp->conn_lock); 3699 3700 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3701 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3702 (do_ipsec ? IPDF_IPSEC : 0)); 3703 switch (error) { 3704 case 0: 3705 break; 3706 case EADDRNOTAVAIL: 3707 /* 3708 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3709 * Don't have the application see that errno 3710 */ 3711 error = ENETUNREACH; 3712 goto failed; 3713 case ENETDOWN: 3714 /* 3715 * Have !ipif_addr_ready address; drop packet silently 3716 * until we can get applications to not send until we 3717 * are ready. 3718 */ 3719 error = 0; 3720 goto failed; 3721 case EHOSTUNREACH: 3722 case ENETUNREACH: 3723 if (ixa->ixa_ire != NULL) { 3724 /* 3725 * Let conn_ip_output/ire_send_noroute return 3726 * the error and send any local ICMP error. 3727 */ 3728 error = 0; 3729 break; 3730 } 3731 /* FALLTHRU */ 3732 default: 3733 failed: 3734 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3735 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3736 ixa->ixa_cpid = connp->conn_cpid; 3737 ixa_refrele(ixa); 3738 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3739 freemsg(mp); 3740 return (error); 3741 } 3742 } else { 3743 /* Done with conn_t */ 3744 mutex_exit(&connp->conn_lock); 3745 } 3746 3747 /* We're done. Pass the packet to ip. */ 3748 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3749 3750 error = conn_ip_output(mp, ixa); 3751 /* No rawipOutErrors if an error since IP increases its error counter */ 3752 switch (error) { 3753 case 0: 3754 break; 3755 case EWOULDBLOCK: 3756 (void) ixa_check_drain_insert(connp, ixa); 3757 error = 0; 3758 break; 3759 case EADDRNOTAVAIL: 3760 /* 3761 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3762 * Don't have the application see that errno 3763 */ 3764 error = ENETUNREACH; 3765 break; 3766 } 3767 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3768 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3769 ixa->ixa_cpid = connp->conn_cpid; 3770 ixa_refrele(ixa); 3771 return (error); 3772 } 3773 3774 /* 3775 * Handle sending an M_DATA to the last destination. 3776 * Handles both IPv4 and IPv6. 3777 * 3778 * NOTE: The caller must hold conn_lock and we drop it here. 3779 */ 3780 int 3781 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3782 ip_xmit_attr_t *ixa) 3783 { 3784 icmp_t *icmp = connp->conn_icmp; 3785 icmp_stack_t *is = icmp->icmp_is; 3786 int error; 3787 boolean_t do_ipsec; 3788 3789 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3790 ASSERT(ixa != NULL); 3791 3792 ASSERT(cr != NULL); 3793 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3794 ixa->ixa_cred = cr; 3795 ixa->ixa_cpid = pid; 3796 3797 /* Defer IPsec if it might need to look at ICMP type/code */ 3798 switch (ixa->ixa_protocol) { 3799 case IPPROTO_ICMP: 3800 case IPPROTO_ICMPV6: 3801 do_ipsec = B_FALSE; 3802 break; 3803 default: 3804 do_ipsec = B_TRUE; 3805 } 3806 3807 3808 mp = icmp_prepend_header_template(connp, ixa, mp, 3809 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3810 3811 if (mp == NULL) { 3812 ASSERT(error != 0); 3813 mutex_exit(&connp->conn_lock); 3814 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3815 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3816 ixa->ixa_cpid = connp->conn_cpid; 3817 ixa_refrele(ixa); 3818 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3819 freemsg(mp); 3820 return (error); 3821 } 3822 3823 if (!do_ipsec) { 3824 /* Policy might differ for different ICMP type/code */ 3825 mp = icmp_output_attach_policy(mp, connp, ixa); 3826 if (mp == NULL) { 3827 mutex_exit(&connp->conn_lock); 3828 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3829 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3830 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3831 ixa->ixa_cpid = connp->conn_cpid; 3832 ixa_refrele(ixa); 3833 return (EHOSTUNREACH); /* IPsec policy failure */ 3834 } 3835 } 3836 3837 /* 3838 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3839 * safe copy, then we need to fill in any pointers in it. 3840 */ 3841 if (ixa->ixa_ire == NULL) { 3842 in6_addr_t lastdst, lastsrc; 3843 in6_addr_t nexthop; 3844 in_port_t lastport; 3845 3846 lastsrc = connp->conn_v6lastsrc; 3847 lastdst = connp->conn_v6lastdst; 3848 lastport = connp->conn_lastdstport; 3849 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3850 mutex_exit(&connp->conn_lock); 3851 3852 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3853 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3854 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3855 switch (error) { 3856 case 0: 3857 break; 3858 case EADDRNOTAVAIL: 3859 /* 3860 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3861 * Don't have the application see that errno 3862 */ 3863 error = ENETUNREACH; 3864 goto failed; 3865 case ENETDOWN: 3866 /* 3867 * Have !ipif_addr_ready address; drop packet silently 3868 * until we can get applications to not send until we 3869 * are ready. 3870 */ 3871 error = 0; 3872 goto failed; 3873 case EHOSTUNREACH: 3874 case ENETUNREACH: 3875 if (ixa->ixa_ire != NULL) { 3876 /* 3877 * Let conn_ip_output/ire_send_noroute return 3878 * the error and send any local ICMP error. 3879 */ 3880 error = 0; 3881 break; 3882 } 3883 /* FALLTHRU */ 3884 default: 3885 failed: 3886 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3887 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3888 ixa->ixa_cpid = connp->conn_cpid; 3889 ixa_refrele(ixa); 3890 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3891 freemsg(mp); 3892 return (error); 3893 } 3894 } else { 3895 /* Done with conn_t */ 3896 mutex_exit(&connp->conn_lock); 3897 } 3898 3899 /* We're done. Pass the packet to ip. */ 3900 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3901 error = conn_ip_output(mp, ixa); 3902 /* No rawipOutErrors if an error since IP increases its error counter */ 3903 switch (error) { 3904 case 0: 3905 break; 3906 case EWOULDBLOCK: 3907 (void) ixa_check_drain_insert(connp, ixa); 3908 error = 0; 3909 break; 3910 case EADDRNOTAVAIL: 3911 /* 3912 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3913 * Don't have the application see that errno 3914 */ 3915 error = ENETUNREACH; 3916 /* FALLTHRU */ 3917 default: 3918 mutex_enter(&connp->conn_lock); 3919 /* 3920 * Clear the source and v6lastdst so we call ip_attr_connect 3921 * for the next packet and try to pick a better source. 3922 */ 3923 if (connp->conn_mcbc_bind) 3924 connp->conn_saddr_v6 = ipv6_all_zeros; 3925 else 3926 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3927 connp->conn_v6lastdst = ipv6_all_zeros; 3928 mutex_exit(&connp->conn_lock); 3929 break; 3930 } 3931 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3932 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3933 ixa->ixa_cpid = connp->conn_cpid; 3934 ixa_refrele(ixa); 3935 return (error); 3936 } 3937 3938 3939 /* 3940 * Prepend the header template and then fill in the source and 3941 * flowinfo. The caller needs to handle the destination address since 3942 * it's setting is different if rthdr or source route. 3943 * 3944 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3945 * When it returns NULL it sets errorp. 3946 */ 3947 static mblk_t * 3948 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, 3949 const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) 3950 { 3951 icmp_t *icmp = connp->conn_icmp; 3952 icmp_stack_t *is = icmp->icmp_is; 3953 uint_t pktlen; 3954 uint_t copylen; 3955 uint8_t *iph; 3956 uint_t ip_hdr_length; 3957 uint32_t cksum; 3958 ip_pkt_t *ipp; 3959 3960 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3961 3962 /* 3963 * Copy the header template. 3964 */ 3965 copylen = connp->conn_ht_iphc_len; 3966 pktlen = copylen + msgdsize(mp); 3967 if (pktlen > IP_MAXPACKET) { 3968 freemsg(mp); 3969 *errorp = EMSGSIZE; 3970 return (NULL); 3971 } 3972 ixa->ixa_pktlen = pktlen; 3973 3974 /* check/fix buffer config, setup pointers into it */ 3975 iph = mp->b_rptr - copylen; 3976 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { 3977 mblk_t *mp1; 3978 3979 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); 3980 if (mp1 == NULL) { 3981 freemsg(mp); 3982 *errorp = ENOMEM; 3983 return (NULL); 3984 } 3985 mp1->b_wptr = DB_LIM(mp1); 3986 mp1->b_cont = mp; 3987 mp = mp1; 3988 iph = (mp->b_wptr - copylen); 3989 } 3990 mp->b_rptr = iph; 3991 bcopy(connp->conn_ht_iphc, iph, copylen); 3992 ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); 3993 3994 ixa->ixa_ip_hdr_length = ip_hdr_length; 3995 3996 /* 3997 * Prepare for ICMPv6 checksum done in IP. 3998 * 3999 * icmp_build_hdr_template has already massaged any routing header 4000 * and placed the result in conn_sum. 4001 * 4002 * We make it easy for IP to include our pseudo header 4003 * by putting our length (and any routing header adjustment) 4004 * in the ICMPv6 checksum field. 4005 */ 4006 cksum = pktlen - ip_hdr_length; 4007 4008 cksum += connp->conn_sum; 4009 cksum = (cksum >> 16) + (cksum & 0xFFFF); 4010 ASSERT(cksum < 0x10000); 4011 4012 ipp = &connp->conn_xmit_ipp; 4013 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4014 ipha_t *ipha = (ipha_t *)iph; 4015 4016 ipha->ipha_length = htons((uint16_t)pktlen); 4017 4018 /* if IP_PKTINFO specified an addres it wins over bind() */ 4019 if ((ipp->ipp_fields & IPPF_ADDR) && 4020 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4021 ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); 4022 ipha->ipha_src = ipp->ipp_addr_v4; 4023 } else { 4024 IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); 4025 } 4026 } else { 4027 ip6_t *ip6h = (ip6_t *)iph; 4028 uint_t cksum_offset = 0; 4029 4030 ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); 4031 4032 /* if IP_PKTINFO specified an addres it wins over bind() */ 4033 if ((ipp->ipp_fields & IPPF_ADDR) && 4034 !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4035 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); 4036 ip6h->ip6_src = ipp->ipp_addr; 4037 } else { 4038 ip6h->ip6_src = *v6src; 4039 } 4040 ip6h->ip6_vcf = 4041 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 4042 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 4043 if (ipp->ipp_fields & IPPF_TCLASS) { 4044 /* Overrides the class part of flowinfo */ 4045 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, 4046 ipp->ipp_tclass); 4047 } 4048 4049 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 4050 if (connp->conn_proto == IPPROTO_ICMPV6) { 4051 cksum_offset = ixa->ixa_ip_hdr_length + 4052 offsetof(icmp6_t, icmp6_cksum); 4053 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 4054 cksum_offset = ixa->ixa_ip_hdr_length + 4055 ixa->ixa_raw_cksum_offset; 4056 } 4057 } 4058 if (cksum_offset != 0) { 4059 uint16_t *ptr; 4060 4061 /* Make sure the checksum fits in the first mblk */ 4062 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 4063 mblk_t *mp1; 4064 4065 mp1 = msgpullup(mp, 4066 cksum_offset + sizeof (short)); 4067 freemsg(mp); 4068 if (mp1 == NULL) { 4069 *errorp = ENOMEM; 4070 return (NULL); 4071 } 4072 mp = mp1; 4073 iph = mp->b_rptr; 4074 ip6h = (ip6_t *)iph; 4075 } 4076 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 4077 *ptr = htons(cksum); 4078 } 4079 } 4080 4081 return (mp); 4082 } 4083 4084 /* 4085 * This routine handles all messages passed downstream. It either 4086 * consumes the message or passes it downstream; it never queues a 4087 * a message. 4088 */ 4089 void 4090 icmp_wput(queue_t *q, mblk_t *mp) 4091 { 4092 sin6_t *sin6; 4093 sin_t *sin = NULL; 4094 uint_t srcid; 4095 conn_t *connp = Q_TO_CONN(q); 4096 icmp_t *icmp = connp->conn_icmp; 4097 int error = 0; 4098 struct sockaddr *addr = NULL; 4099 socklen_t addrlen; 4100 icmp_stack_t *is = icmp->icmp_is; 4101 struct T_unitdata_req *tudr; 4102 mblk_t *data_mp; 4103 cred_t *cr; 4104 pid_t pid; 4105 4106 /* 4107 * We directly handle several cases here: T_UNITDATA_REQ message 4108 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected 4109 * socket. 4110 */ 4111 switch (DB_TYPE(mp)) { 4112 case M_DATA: 4113 /* sockfs never sends down M_DATA */ 4114 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4115 freemsg(mp); 4116 return; 4117 4118 case M_PROTO: 4119 case M_PCPROTO: 4120 tudr = (struct T_unitdata_req *)mp->b_rptr; 4121 if (MBLKL(mp) < sizeof (*tudr) || 4122 ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { 4123 icmp_wput_other(q, mp); 4124 return; 4125 } 4126 break; 4127 4128 default: 4129 icmp_wput_other(q, mp); 4130 return; 4131 } 4132 4133 /* Handle valid T_UNITDATA_REQ here */ 4134 data_mp = mp->b_cont; 4135 if (data_mp == NULL) { 4136 error = EPROTO; 4137 goto ud_error2; 4138 } 4139 mp->b_cont = NULL; 4140 4141 if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { 4142 error = EADDRNOTAVAIL; 4143 goto ud_error2; 4144 } 4145 4146 /* 4147 * All Solaris components should pass a db_credp 4148 * for this message, hence we ASSERT. 4149 * On production kernels we return an error to be robust against 4150 * random streams modules sitting on top of us. 4151 */ 4152 cr = msg_getcred(mp, &pid); 4153 ASSERT(cr != NULL); 4154 if (cr == NULL) { 4155 error = EINVAL; 4156 goto ud_error2; 4157 } 4158 4159 /* 4160 * If a port has not been bound to the stream, fail. 4161 * This is not a problem when sockfs is directly 4162 * above us, because it will ensure that the socket 4163 * is first bound before allowing data to be sent. 4164 */ 4165 if (icmp->icmp_state == TS_UNBND) { 4166 error = EPROTO; 4167 goto ud_error2; 4168 } 4169 addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; 4170 addrlen = tudr->DEST_length; 4171 4172 switch (connp->conn_family) { 4173 case AF_INET6: 4174 sin6 = (sin6_t *)addr; 4175 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || 4176 (sin6->sin6_family != AF_INET6)) { 4177 error = EADDRNOTAVAIL; 4178 goto ud_error2; 4179 } 4180 4181 /* No support for mapped addresses on raw sockets */ 4182 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4183 error = EADDRNOTAVAIL; 4184 goto ud_error2; 4185 } 4186 srcid = sin6->__sin6_src_id; 4187 4188 /* 4189 * If the local address is a mapped address return 4190 * an error. 4191 * It would be possible to send an IPv6 packet but the 4192 * response would never make it back to the application 4193 * since it is bound to a mapped address. 4194 */ 4195 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 4196 error = EADDRNOTAVAIL; 4197 goto ud_error2; 4198 } 4199 4200 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 4201 sin6->sin6_addr = ipv6_loopback; 4202 4203 if (tudr->OPT_length != 0) { 4204 /* 4205 * If we are connected then the destination needs to be 4206 * the same as the connected one. 4207 */ 4208 if (icmp->icmp_state == TS_DATA_XFER && 4209 !conn_same_as_last_v6(connp, sin6)) { 4210 error = EISCONN; 4211 goto ud_error2; 4212 } 4213 error = icmp_output_ancillary(connp, NULL, sin6, 4214 data_mp, mp, NULL, cr, pid); 4215 } else { 4216 ip_xmit_attr_t *ixa; 4217 4218 /* 4219 * We have to allocate an ip_xmit_attr_t before we grab 4220 * conn_lock and we need to hold conn_lock once we've 4221 * checked conn_same_as_last_v6 to handle concurrent 4222 * send* calls on a socket. 4223 */ 4224 ixa = conn_get_ixa(connp, B_FALSE); 4225 if (ixa == NULL) { 4226 error = ENOMEM; 4227 goto ud_error2; 4228 } 4229 mutex_enter(&connp->conn_lock); 4230 4231 if (conn_same_as_last_v6(connp, sin6) && 4232 connp->conn_lastsrcid == srcid && 4233 ipsec_outbound_policy_current(ixa)) { 4234 /* icmp_output_lastdst drops conn_lock */ 4235 error = icmp_output_lastdst(connp, data_mp, cr, 4236 pid, ixa); 4237 } else { 4238 /* icmp_output_newdst drops conn_lock */ 4239 error = icmp_output_newdst(connp, data_mp, NULL, 4240 sin6, cr, pid, ixa); 4241 } 4242 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4243 } 4244 if (error == 0) { 4245 freeb(mp); 4246 return; 4247 } 4248 break; 4249 4250 case AF_INET: 4251 sin = (sin_t *)addr; 4252 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || 4253 (sin->sin_family != AF_INET)) { 4254 error = EADDRNOTAVAIL; 4255 goto ud_error2; 4256 } 4257 if (sin->sin_addr.s_addr == INADDR_ANY) 4258 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 4259 4260 /* Protocol 255 contains full IP headers */ 4261 /* Read without holding lock */ 4262 if (icmp->icmp_hdrincl) { 4263 if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { 4264 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { 4265 error = EINVAL; 4266 goto ud_error2; 4267 } 4268 } 4269 error = icmp_output_hdrincl(connp, data_mp, cr, pid); 4270 if (error == 0) { 4271 freeb(mp); 4272 return; 4273 } 4274 /* data_mp consumed above */ 4275 data_mp = NULL; 4276 goto ud_error2; 4277 } 4278 4279 if (tudr->OPT_length != 0) { 4280 /* 4281 * If we are connected then the destination needs to be 4282 * the same as the connected one. 4283 */ 4284 if (icmp->icmp_state == TS_DATA_XFER && 4285 !conn_same_as_last_v4(connp, sin)) { 4286 error = EISCONN; 4287 goto ud_error2; 4288 } 4289 error = icmp_output_ancillary(connp, sin, NULL, 4290 data_mp, mp, NULL, cr, pid); 4291 } else { 4292 ip_xmit_attr_t *ixa; 4293 4294 /* 4295 * We have to allocate an ip_xmit_attr_t before we grab 4296 * conn_lock and we need to hold conn_lock once we've 4297 * checked conn_same_as_last_v4 to handle concurrent 4298 * send* calls on a socket. 4299 */ 4300 ixa = conn_get_ixa(connp, B_FALSE); 4301 if (ixa == NULL) { 4302 error = ENOMEM; 4303 goto ud_error2; 4304 } 4305 mutex_enter(&connp->conn_lock); 4306 4307 if (conn_same_as_last_v4(connp, sin) && 4308 ipsec_outbound_policy_current(ixa)) { 4309 /* icmp_output_lastdst drops conn_lock */ 4310 error = icmp_output_lastdst(connp, data_mp, cr, 4311 pid, ixa); 4312 } else { 4313 /* icmp_output_newdst drops conn_lock */ 4314 error = icmp_output_newdst(connp, data_mp, sin, 4315 NULL, cr, pid, ixa); 4316 } 4317 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4318 } 4319 if (error == 0) { 4320 freeb(mp); 4321 return; 4322 } 4323 break; 4324 } 4325 ASSERT(mp != NULL); 4326 /* mp is freed by the following routine */ 4327 icmp_ud_err(q, mp, (t_scalar_t)error); 4328 return; 4329 4330 ud_error2: 4331 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4332 freemsg(data_mp); 4333 ASSERT(mp != NULL); 4334 /* mp is freed by the following routine */ 4335 icmp_ud_err(q, mp, (t_scalar_t)error); 4336 } 4337 4338 /* 4339 * Handle the case of the IP address or flow label being different 4340 * for both IPv4 and IPv6. 4341 * 4342 * NOTE: The caller must hold conn_lock and we drop it here. 4343 */ 4344 static int 4345 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4346 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4347 { 4348 icmp_t *icmp = connp->conn_icmp; 4349 icmp_stack_t *is = icmp->icmp_is; 4350 int error; 4351 ip_xmit_attr_t *oldixa; 4352 boolean_t do_ipsec; 4353 uint_t srcid; 4354 uint32_t flowinfo; 4355 in6_addr_t v6src; 4356 in6_addr_t v6dst; 4357 in6_addr_t v6nexthop; 4358 in_port_t dstport; 4359 4360 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4361 ASSERT(ixa != NULL); 4362 4363 /* 4364 * We hold conn_lock across all the use and modifications of 4365 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4366 * stay consistent. 4367 */ 4368 4369 ASSERT(cr != NULL); 4370 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4371 ixa->ixa_cred = cr; 4372 ixa->ixa_cpid = pid; 4373 if (is_system_labeled()) { 4374 /* We need to restart with a label based on the cred */ 4375 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4376 } 4377 /* 4378 * If we are connected then the destination needs to be the 4379 * same as the connected one, which is not the case here since we 4380 * checked for that above. 4381 */ 4382 if (icmp->icmp_state == TS_DATA_XFER) { 4383 mutex_exit(&connp->conn_lock); 4384 error = EISCONN; 4385 goto ud_error; 4386 } 4387 4388 /* In case previous destination was multicast or multirt */ 4389 ip_attr_newdst(ixa); 4390 4391 /* 4392 * If laddr is unspecified then we look at sin6_src_id. 4393 * We will give precedence to a source address set with IPV6_PKTINFO 4394 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4395 * want ip_attr_connect to select a source (since it can fail) when 4396 * IPV6_PKTINFO is specified. 4397 * If this doesn't result in a source address then we get a source 4398 * from ip_attr_connect() below. 4399 */ 4400 v6src = connp->conn_saddr_v6; 4401 if (sin != NULL) { 4402 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4403 dstport = sin->sin_port; 4404 flowinfo = 0; 4405 srcid = 0; 4406 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4407 if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { 4408 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4409 connp->conn_netstack); 4410 } 4411 ixa->ixa_flags |= IXAF_IS_IPV4; 4412 } else { 4413 v6dst = sin6->sin6_addr; 4414 dstport = sin6->sin6_port; 4415 flowinfo = sin6->sin6_flowinfo; 4416 srcid = sin6->__sin6_src_id; 4417 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4418 ixa->ixa_scopeid = sin6->sin6_scope_id; 4419 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4420 } else { 4421 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4422 } 4423 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4424 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4425 connp->conn_netstack); 4426 } 4427 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 4428 ixa->ixa_flags |= IXAF_IS_IPV4; 4429 else 4430 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4431 } 4432 /* Handle IPV6_PKTINFO setting source address. */ 4433 if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && 4434 (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) { 4435 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4436 4437 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4438 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4439 v6src = ipp->ipp_addr; 4440 } else { 4441 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4442 v6src = ipp->ipp_addr; 4443 } 4444 } 4445 4446 /* Defer IPsec if it might need to look at ICMP type/code */ 4447 switch (ixa->ixa_protocol) { 4448 case IPPROTO_ICMP: 4449 case IPPROTO_ICMPV6: 4450 do_ipsec = B_FALSE; 4451 break; 4452 default: 4453 do_ipsec = B_TRUE; 4454 } 4455 4456 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4457 mutex_exit(&connp->conn_lock); 4458 4459 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4460 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4461 (do_ipsec ? IPDF_IPSEC : 0)); 4462 switch (error) { 4463 case 0: 4464 break; 4465 case EADDRNOTAVAIL: 4466 /* 4467 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4468 * Don't have the application see that errno 4469 */ 4470 error = ENETUNREACH; 4471 goto failed; 4472 case ENETDOWN: 4473 /* 4474 * Have !ipif_addr_ready address; drop packet silently 4475 * until we can get applications to not send until we 4476 * are ready. 4477 */ 4478 error = 0; 4479 goto failed; 4480 case EHOSTUNREACH: 4481 case ENETUNREACH: 4482 if (ixa->ixa_ire != NULL) { 4483 /* 4484 * Let conn_ip_output/ire_send_noroute return 4485 * the error and send any local ICMP error. 4486 */ 4487 error = 0; 4488 break; 4489 } 4490 /* FALLTHRU */ 4491 default: 4492 failed: 4493 goto ud_error; 4494 } 4495 4496 mutex_enter(&connp->conn_lock); 4497 /* 4498 * While we dropped the lock some other thread might have connected 4499 * this socket. If so we bail out with EISCONN to ensure that the 4500 * connecting thread is the one that updates conn_ixa, conn_ht_* 4501 * and conn_*last*. 4502 */ 4503 if (icmp->icmp_state == TS_DATA_XFER) { 4504 mutex_exit(&connp->conn_lock); 4505 error = EISCONN; 4506 goto ud_error; 4507 } 4508 4509 /* 4510 * We need to rebuild the headers if 4511 * - we are labeling packets (could be different for different 4512 * destinations) 4513 * - we have a source route (or routing header) since we need to 4514 * massage that to get the pseudo-header checksum 4515 * - a socket option with COA_HEADER_CHANGED has been set which 4516 * set conn_v6lastdst to zero. 4517 * 4518 * Otherwise the prepend function will just update the src, dst, 4519 * and flow label. 4520 */ 4521 if (is_system_labeled()) { 4522 /* TX MLP requires SCM_UCRED and don't have that here */ 4523 if (connp->conn_mlp_type != mlptSingle) { 4524 mutex_exit(&connp->conn_lock); 4525 error = ECONNREFUSED; 4526 goto ud_error; 4527 } 4528 /* 4529 * Check whether Trusted Solaris policy allows communication 4530 * with this host, and pretend that the destination is 4531 * unreachable if not. 4532 * Compute any needed label and place it in ipp_label_v4/v6. 4533 * 4534 * Later conn_build_hdr_template/conn_prepend_hdr takes 4535 * ipp_label_v4/v6 to form the packet. 4536 * 4537 * Tsol note: Since we hold conn_lock we know no other 4538 * thread manipulates conn_xmit_ipp. 4539 */ 4540 error = conn_update_label(connp, ixa, &v6dst, 4541 &connp->conn_xmit_ipp); 4542 if (error != 0) { 4543 mutex_exit(&connp->conn_lock); 4544 goto ud_error; 4545 } 4546 /* Rebuild the header template */ 4547 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4548 flowinfo); 4549 if (error != 0) { 4550 mutex_exit(&connp->conn_lock); 4551 goto ud_error; 4552 } 4553 } else if (connp->conn_xmit_ipp.ipp_fields & 4554 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4555 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4556 /* Rebuild the header template */ 4557 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4558 flowinfo); 4559 if (error != 0) { 4560 mutex_exit(&connp->conn_lock); 4561 goto ud_error; 4562 } 4563 } else { 4564 /* Simply update the destination address if no source route */ 4565 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4566 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4567 4568 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4569 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4570 ipha->ipha_fragment_offset_and_flags |= 4571 IPH_DF_HTONS; 4572 } else { 4573 ipha->ipha_fragment_offset_and_flags &= 4574 ~IPH_DF_HTONS; 4575 } 4576 } else { 4577 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4578 ip6h->ip6_dst = v6dst; 4579 } 4580 } 4581 4582 /* 4583 * Remember the dst etc which corresponds to the built header 4584 * template and conn_ixa. 4585 */ 4586 oldixa = conn_replace_ixa(connp, ixa); 4587 connp->conn_v6lastdst = v6dst; 4588 connp->conn_lastflowinfo = flowinfo; 4589 connp->conn_lastscopeid = ixa->ixa_scopeid; 4590 connp->conn_lastsrcid = srcid; 4591 /* Also remember a source to use together with lastdst */ 4592 connp->conn_v6lastsrc = v6src; 4593 4594 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4595 flowinfo, &error); 4596 4597 /* Done with conn_t */ 4598 mutex_exit(&connp->conn_lock); 4599 ixa_refrele(oldixa); 4600 4601 if (data_mp == NULL) { 4602 ASSERT(error != 0); 4603 goto ud_error; 4604 } 4605 4606 if (!do_ipsec) { 4607 /* Policy might differ for different ICMP type/code */ 4608 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4609 if (data_mp == NULL) { 4610 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4611 error = EHOSTUNREACH; /* IPsec policy failure */ 4612 goto done; 4613 } 4614 } 4615 4616 /* We're done. Pass the packet to ip. */ 4617 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4618 4619 error = conn_ip_output(data_mp, ixa); 4620 /* No rawipOutErrors if an error since IP increases its error counter */ 4621 switch (error) { 4622 case 0: 4623 break; 4624 case EWOULDBLOCK: 4625 (void) ixa_check_drain_insert(connp, ixa); 4626 error = 0; 4627 break; 4628 case EADDRNOTAVAIL: 4629 /* 4630 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4631 * Don't have the application see that errno 4632 */ 4633 error = ENETUNREACH; 4634 /* FALLTHRU */ 4635 default: 4636 mutex_enter(&connp->conn_lock); 4637 /* 4638 * Clear the source and v6lastdst so we call ip_attr_connect 4639 * for the next packet and try to pick a better source. 4640 */ 4641 if (connp->conn_mcbc_bind) 4642 connp->conn_saddr_v6 = ipv6_all_zeros; 4643 else 4644 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4645 connp->conn_v6lastdst = ipv6_all_zeros; 4646 mutex_exit(&connp->conn_lock); 4647 break; 4648 } 4649 done: 4650 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4651 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4652 ixa->ixa_cpid = connp->conn_cpid; 4653 ixa_refrele(ixa); 4654 return (error); 4655 4656 ud_error: 4657 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4658 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4659 ixa->ixa_cpid = connp->conn_cpid; 4660 ixa_refrele(ixa); 4661 4662 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4663 freemsg(data_mp); 4664 return (error); 4665 } 4666 4667 /* ARGSUSED */ 4668 static void 4669 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4670 { 4671 #ifdef DEBUG 4672 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4673 #endif 4674 freemsg(mp); 4675 } 4676 4677 static void 4678 icmp_wput_other(queue_t *q, mblk_t *mp) 4679 { 4680 uchar_t *rptr = mp->b_rptr; 4681 struct iocblk *iocp; 4682 conn_t *connp = Q_TO_CONN(q); 4683 icmp_t *icmp = connp->conn_icmp; 4684 cred_t *cr; 4685 4686 switch (mp->b_datap->db_type) { 4687 case M_PROTO: 4688 case M_PCPROTO: 4689 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4690 /* 4691 * If the message does not contain a PRIM_type, 4692 * throw it away. 4693 */ 4694 freemsg(mp); 4695 return; 4696 } 4697 switch (((t_primp_t)rptr)->type) { 4698 case T_ADDR_REQ: 4699 icmp_addr_req(q, mp); 4700 return; 4701 case O_T_BIND_REQ: 4702 case T_BIND_REQ: 4703 icmp_tpi_bind(q, mp); 4704 return; 4705 case T_CONN_REQ: 4706 icmp_tpi_connect(q, mp); 4707 return; 4708 case T_CAPABILITY_REQ: 4709 icmp_capability_req(q, mp); 4710 return; 4711 case T_INFO_REQ: 4712 icmp_info_req(q, mp); 4713 return; 4714 case T_UNITDATA_REQ: 4715 /* 4716 * If a T_UNITDATA_REQ gets here, the address must 4717 * be bad. Valid T_UNITDATA_REQs are handled 4718 * in icmp_wput. 4719 */ 4720 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4721 return; 4722 case T_UNBIND_REQ: 4723 icmp_tpi_unbind(q, mp); 4724 return; 4725 case T_SVR4_OPTMGMT_REQ: 4726 /* 4727 * All Solaris components should pass a db_credp 4728 * for this TPI message, hence we ASSERT. 4729 * But in case there is some other M_PROTO that looks 4730 * like a TPI message sent by some other kernel 4731 * component, we check and return an error. 4732 */ 4733 cr = msg_getcred(mp, NULL); 4734 ASSERT(cr != NULL); 4735 if (cr == NULL) { 4736 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4737 return; 4738 } 4739 4740 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4741 cr)) { 4742 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4743 } 4744 return; 4745 4746 case T_OPTMGMT_REQ: 4747 /* 4748 * All Solaris components should pass a db_credp 4749 * for this TPI message, hence we ASSERT. 4750 * But in case there is some other M_PROTO that looks 4751 * like a TPI message sent by some other kernel 4752 * component, we check and return an error. 4753 */ 4754 cr = msg_getcred(mp, NULL); 4755 ASSERT(cr != NULL); 4756 if (cr == NULL) { 4757 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4758 return; 4759 } 4760 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4761 return; 4762 4763 case T_DISCON_REQ: 4764 icmp_tpi_disconnect(q, mp); 4765 return; 4766 4767 /* The following TPI message is not supported by icmp. */ 4768 case O_T_CONN_RES: 4769 case T_CONN_RES: 4770 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4771 return; 4772 4773 /* The following 3 TPI requests are illegal for icmp. */ 4774 case T_DATA_REQ: 4775 case T_EXDATA_REQ: 4776 case T_ORDREL_REQ: 4777 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4778 return; 4779 default: 4780 break; 4781 } 4782 break; 4783 case M_FLUSH: 4784 if (*rptr & FLUSHW) 4785 flushq(q, FLUSHDATA); 4786 break; 4787 case M_IOCTL: 4788 iocp = (struct iocblk *)mp->b_rptr; 4789 switch (iocp->ioc_cmd) { 4790 case TI_GETPEERNAME: 4791 if (icmp->icmp_state != TS_DATA_XFER) { 4792 /* 4793 * If a default destination address has not 4794 * been associated with the stream, then we 4795 * don't know the peer's name. 4796 */ 4797 iocp->ioc_error = ENOTCONN; 4798 iocp->ioc_count = 0; 4799 mp->b_datap->db_type = M_IOCACK; 4800 qreply(q, mp); 4801 return; 4802 } 4803 /* FALLTHRU */ 4804 case TI_GETMYNAME: 4805 /* 4806 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4807 * need to copyin the user's strbuf structure. 4808 * Processing will continue in the M_IOCDATA case 4809 * below. 4810 */ 4811 mi_copyin(q, mp, NULL, 4812 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4813 return; 4814 default: 4815 break; 4816 } 4817 break; 4818 case M_IOCDATA: 4819 icmp_wput_iocdata(q, mp); 4820 return; 4821 default: 4822 /* Unrecognized messages are passed through without change. */ 4823 break; 4824 } 4825 ip_wput_nondata(q, mp); 4826 } 4827 4828 /* 4829 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4830 * messages. 4831 */ 4832 static void 4833 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4834 { 4835 mblk_t *mp1; 4836 STRUCT_HANDLE(strbuf, sb); 4837 uint_t addrlen; 4838 conn_t *connp = Q_TO_CONN(q); 4839 icmp_t *icmp = connp->conn_icmp; 4840 4841 /* Make sure it is one of ours. */ 4842 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4843 case TI_GETMYNAME: 4844 case TI_GETPEERNAME: 4845 break; 4846 default: 4847 ip_wput_nondata(q, mp); 4848 return; 4849 } 4850 4851 switch (mi_copy_state(q, mp, &mp1)) { 4852 case -1: 4853 return; 4854 case MI_COPY_CASE(MI_COPY_IN, 1): 4855 break; 4856 case MI_COPY_CASE(MI_COPY_OUT, 1): 4857 /* 4858 * The address has been copied out, so now 4859 * copyout the strbuf. 4860 */ 4861 mi_copyout(q, mp); 4862 return; 4863 case MI_COPY_CASE(MI_COPY_OUT, 2): 4864 /* 4865 * The address and strbuf have been copied out. 4866 * We're done, so just acknowledge the original 4867 * M_IOCTL. 4868 */ 4869 mi_copy_done(q, mp, 0); 4870 return; 4871 default: 4872 /* 4873 * Something strange has happened, so acknowledge 4874 * the original M_IOCTL with an EPROTO error. 4875 */ 4876 mi_copy_done(q, mp, EPROTO); 4877 return; 4878 } 4879 4880 /* 4881 * Now we have the strbuf structure for TI_GETMYNAME 4882 * and TI_GETPEERNAME. Next we copyout the requested 4883 * address and then we'll copyout the strbuf. 4884 */ 4885 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4886 (void *)mp1->b_rptr); 4887 4888 if (connp->conn_family == AF_INET) 4889 addrlen = sizeof (sin_t); 4890 else 4891 addrlen = sizeof (sin6_t); 4892 4893 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4894 mi_copy_done(q, mp, EINVAL); 4895 return; 4896 } 4897 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4898 case TI_GETMYNAME: 4899 break; 4900 case TI_GETPEERNAME: 4901 if (icmp->icmp_state != TS_DATA_XFER) { 4902 mi_copy_done(q, mp, ENOTCONN); 4903 return; 4904 } 4905 break; 4906 default: 4907 mi_copy_done(q, mp, EPROTO); 4908 return; 4909 } 4910 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4911 if (!mp1) 4912 return; 4913 4914 STRUCT_FSET(sb, len, addrlen); 4915 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4916 case TI_GETMYNAME: 4917 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4918 &addrlen); 4919 break; 4920 case TI_GETPEERNAME: 4921 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4922 &addrlen); 4923 break; 4924 } 4925 mp1->b_wptr += addrlen; 4926 /* Copy out the address */ 4927 mi_copyout(q, mp); 4928 } 4929 4930 void 4931 icmp_ddi_g_init(void) 4932 { 4933 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4934 icmp_opt_obj.odb_opt_arr_cnt); 4935 4936 /* 4937 * We want to be informed each time a stack is created or 4938 * destroyed in the kernel, so we can maintain the 4939 * set of icmp_stack_t's. 4940 */ 4941 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4942 } 4943 4944 void 4945 icmp_ddi_g_destroy(void) 4946 { 4947 netstack_unregister(NS_ICMP); 4948 } 4949 4950 #define INET_NAME "ip" 4951 4952 /* 4953 * Initialize the ICMP stack instance. 4954 */ 4955 static void * 4956 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 4957 { 4958 icmp_stack_t *is; 4959 int error = 0; 4960 size_t arrsz; 4961 major_t major; 4962 4963 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 4964 is->is_netstack = ns; 4965 4966 arrsz = sizeof (icmp_propinfo_tbl); 4967 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 4968 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 4969 4970 is->is_ksp = rawip_kstat_init(stackid); 4971 4972 major = mod_name_to_major(INET_NAME); 4973 error = ldi_ident_from_major(major, &is->is_ldi_ident); 4974 ASSERT(error == 0); 4975 return (is); 4976 } 4977 4978 /* 4979 * Free the ICMP stack instance. 4980 */ 4981 static void 4982 rawip_stack_fini(netstackid_t stackid, void *arg) 4983 { 4984 icmp_stack_t *is = (icmp_stack_t *)arg; 4985 4986 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 4987 is->is_propinfo_tbl = NULL; 4988 4989 rawip_kstat_fini(stackid, is->is_ksp); 4990 is->is_ksp = NULL; 4991 ldi_ident_release(is->is_ldi_ident); 4992 kmem_free(is, sizeof (*is)); 4993 } 4994 4995 static void * 4996 rawip_kstat_init(netstackid_t stackid) { 4997 kstat_t *ksp; 4998 4999 rawip_named_kstat_t template = { 5000 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5001 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5002 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5003 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5004 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5005 }; 5006 5007 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5008 KSTAT_TYPE_NAMED, 5009 NUM_OF_FIELDS(rawip_named_kstat_t), 5010 0, stackid); 5011 if (ksp == NULL || ksp->ks_data == NULL) 5012 return (NULL); 5013 5014 bcopy(&template, ksp->ks_data, sizeof (template)); 5015 ksp->ks_update = rawip_kstat_update; 5016 ksp->ks_private = (void *)(uintptr_t)stackid; 5017 5018 kstat_install(ksp); 5019 return (ksp); 5020 } 5021 5022 static void 5023 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5024 { 5025 if (ksp != NULL) { 5026 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5027 kstat_delete_netstack(ksp, stackid); 5028 } 5029 } 5030 5031 static int 5032 rawip_kstat_update(kstat_t *ksp, int rw) 5033 { 5034 rawip_named_kstat_t *rawipkp; 5035 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5036 netstack_t *ns; 5037 icmp_stack_t *is; 5038 5039 if ((ksp == NULL) || (ksp->ks_data == NULL)) 5040 return (EIO); 5041 5042 if (rw == KSTAT_WRITE) 5043 return (EACCES); 5044 5045 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5046 5047 ns = netstack_find_by_stackid(stackid); 5048 if (ns == NULL) 5049 return (-1); 5050 is = ns->netstack_icmp; 5051 if (is == NULL) { 5052 netstack_rele(ns); 5053 return (-1); 5054 } 5055 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5056 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5057 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5058 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5059 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5060 netstack_rele(ns); 5061 return (0); 5062 } 5063 5064 /* ARGSUSED */ 5065 int 5066 rawip_accept(sock_lower_handle_t lproto_handle, 5067 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5068 cred_t *cr) 5069 { 5070 return (EOPNOTSUPP); 5071 } 5072 5073 /* ARGSUSED */ 5074 int 5075 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5076 socklen_t len, cred_t *cr) 5077 { 5078 conn_t *connp = (conn_t *)proto_handle; 5079 int error; 5080 5081 /* All Solaris components should pass a cred for this operation. */ 5082 ASSERT(cr != NULL); 5083 5084 /* Binding to a NULL address really means unbind */ 5085 if (sa == NULL) 5086 error = rawip_do_unbind(connp); 5087 else 5088 error = rawip_do_bind(connp, sa, len); 5089 5090 if (error < 0) { 5091 if (error == -TOUTSTATE) 5092 error = EINVAL; 5093 else 5094 error = proto_tlitosyserr(-error); 5095 } 5096 return (error); 5097 } 5098 5099 static int 5100 rawip_implicit_bind(conn_t *connp) 5101 { 5102 sin6_t sin6addr; 5103 sin_t *sin; 5104 sin6_t *sin6; 5105 socklen_t len; 5106 int error; 5107 5108 if (connp->conn_family == AF_INET) { 5109 len = sizeof (struct sockaddr_in); 5110 sin = (sin_t *)&sin6addr; 5111 *sin = sin_null; 5112 sin->sin_family = AF_INET; 5113 sin->sin_addr.s_addr = INADDR_ANY; 5114 } else { 5115 ASSERT(connp->conn_family == AF_INET6); 5116 len = sizeof (sin6_t); 5117 sin6 = (sin6_t *)&sin6addr; 5118 *sin6 = sin6_null; 5119 sin6->sin6_family = AF_INET6; 5120 V6_SET_ZERO(sin6->sin6_addr); 5121 } 5122 5123 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5124 5125 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5126 } 5127 5128 static int 5129 rawip_unbind(conn_t *connp) 5130 { 5131 int error; 5132 5133 error = rawip_do_unbind(connp); 5134 if (error < 0) { 5135 error = proto_tlitosyserr(-error); 5136 } 5137 return (error); 5138 } 5139 5140 /* ARGSUSED */ 5141 int 5142 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5143 { 5144 return (EOPNOTSUPP); 5145 } 5146 5147 int 5148 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5149 socklen_t len, sock_connid_t *id, cred_t *cr) 5150 { 5151 conn_t *connp = (conn_t *)proto_handle; 5152 icmp_t *icmp = connp->conn_icmp; 5153 int error; 5154 boolean_t did_bind = B_FALSE; 5155 pid_t pid = curproc->p_pid; 5156 5157 /* All Solaris components should pass a cred for this operation. */ 5158 ASSERT(cr != NULL); 5159 5160 if (sa == NULL) { 5161 /* 5162 * Disconnect 5163 * Make sure we are connected 5164 */ 5165 if (icmp->icmp_state != TS_DATA_XFER) 5166 return (EINVAL); 5167 5168 error = icmp_disconnect(connp); 5169 return (error); 5170 } 5171 5172 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5173 if (error != 0) 5174 return (error); 5175 5176 /* do an implicit bind if necessary */ 5177 if (icmp->icmp_state == TS_UNBND) { 5178 error = rawip_implicit_bind(connp); 5179 /* 5180 * We could be racing with an actual bind, in which case 5181 * we would see EPROTO. We cross our fingers and try 5182 * to connect. 5183 */ 5184 if (!(error == 0 || error == EPROTO)) 5185 return (error); 5186 did_bind = B_TRUE; 5187 } 5188 5189 /* 5190 * set SO_DGRAM_ERRIND 5191 */ 5192 connp->conn_dgram_errind = B_TRUE; 5193 5194 error = rawip_do_connect(connp, sa, len, cr, pid); 5195 if (error != 0 && did_bind) { 5196 int unbind_err; 5197 5198 unbind_err = rawip_unbind(connp); 5199 ASSERT(unbind_err == 0); 5200 } 5201 5202 if (error == 0) { 5203 *id = 0; 5204 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5205 0, NULL, -1); 5206 } else if (error < 0) { 5207 error = proto_tlitosyserr(-error); 5208 } 5209 return (error); 5210 } 5211 5212 /* ARGSUSED2 */ 5213 int 5214 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5215 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) 5216 { 5217 conn_t *connp = (conn_t *)proto_handle; 5218 icmp_t *icmp; 5219 struct T_capability_ack tca; 5220 struct sockaddr_in6 laddr, faddr; 5221 socklen_t laddrlen, faddrlen; 5222 short opts; 5223 struct stroptions *stropt; 5224 mblk_t *stropt_mp; 5225 int error; 5226 5227 icmp = connp->conn_icmp; 5228 5229 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5230 5231 /* 5232 * setup the fallback stream that was allocated 5233 */ 5234 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5235 connp->conn_minor_arena = WR(q)->q_ptr; 5236 5237 RD(q)->q_ptr = WR(q)->q_ptr = connp; 5238 5239 WR(q)->q_qinfo = &icmpwinit; 5240 5241 connp->conn_rq = RD(q); 5242 connp->conn_wq = WR(q); 5243 5244 /* Notify stream head about options before sending up data */ 5245 stropt_mp->b_datap->db_type = M_SETOPTS; 5246 stropt_mp->b_wptr += sizeof (*stropt); 5247 stropt = (struct stroptions *)stropt_mp->b_rptr; 5248 stropt->so_flags = SO_WROFF | SO_HIWAT; 5249 stropt->so_wroff = connp->conn_wroff; 5250 stropt->so_hiwat = connp->conn_rcvbuf; 5251 putnext(RD(q), stropt_mp); 5252 5253 /* 5254 * free helper stream 5255 */ 5256 ip_free_helper_stream(connp); 5257 5258 /* 5259 * Collect the information needed to sync with the sonode 5260 */ 5261 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5262 5263 laddrlen = faddrlen = sizeof (sin6_t); 5264 (void) rawip_getsockname((sock_lower_handle_t)connp, 5265 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5266 error = rawip_getpeername((sock_lower_handle_t)connp, 5267 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5268 if (error != 0) 5269 faddrlen = 0; 5270 opts = 0; 5271 if (connp->conn_dgram_errind) 5272 opts |= SO_DGRAM_ERRIND; 5273 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5274 opts |= SO_DONTROUTE; 5275 5276 (*quiesced_cb)(connp->conn_upper_handle, q, &tca, 5277 (struct sockaddr *)&laddr, laddrlen, 5278 (struct sockaddr *)&faddr, faddrlen, opts); 5279 5280 /* 5281 * Attempts to send data up during fallback will result in it being 5282 * queued in icmp_t. Now we push up any queued packets. 5283 */ 5284 mutex_enter(&icmp->icmp_recv_lock); 5285 while (icmp->icmp_fallback_queue_head != NULL) { 5286 mblk_t *mp; 5287 5288 mp = icmp->icmp_fallback_queue_head; 5289 icmp->icmp_fallback_queue_head = mp->b_next; 5290 mp->b_next = NULL; 5291 mutex_exit(&icmp->icmp_recv_lock); 5292 putnext(RD(q), mp); 5293 mutex_enter(&icmp->icmp_recv_lock); 5294 } 5295 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5296 5297 /* 5298 * No longer a streams less socket 5299 */ 5300 mutex_enter(&connp->conn_lock); 5301 connp->conn_flags &= ~IPCL_NONSTR; 5302 mutex_exit(&connp->conn_lock); 5303 5304 mutex_exit(&icmp->icmp_recv_lock); 5305 5306 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5307 icmp->icmp_fallback_queue_tail == NULL); 5308 5309 ASSERT(connp->conn_ref >= 1); 5310 5311 return (0); 5312 } 5313 5314 /* ARGSUSED2 */ 5315 sock_lower_handle_t 5316 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5317 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5318 { 5319 conn_t *connp; 5320 5321 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5322 *errorp = EPROTONOSUPPORT; 5323 return (NULL); 5324 } 5325 5326 connp = rawip_do_open(family, credp, errorp, flags); 5327 if (connp != NULL) { 5328 connp->conn_flags |= IPCL_NONSTR; 5329 5330 mutex_enter(&connp->conn_lock); 5331 connp->conn_state_flags &= ~CONN_INCIPIENT; 5332 mutex_exit(&connp->conn_lock); 5333 *sock_downcalls = &sock_rawip_downcalls; 5334 *smodep = SM_ATOMIC; 5335 } else { 5336 ASSERT(*errorp != 0); 5337 } 5338 5339 return ((sock_lower_handle_t)connp); 5340 } 5341 5342 /* ARGSUSED3 */ 5343 void 5344 rawip_activate(sock_lower_handle_t proto_handle, 5345 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5346 cred_t *cr) 5347 { 5348 conn_t *connp = (conn_t *)proto_handle; 5349 struct sock_proto_props sopp; 5350 5351 /* All Solaris components should pass a cred for this operation. */ 5352 ASSERT(cr != NULL); 5353 5354 connp->conn_upcalls = sock_upcalls; 5355 connp->conn_upper_handle = sock_handle; 5356 5357 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5358 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5359 sopp.sopp_wroff = connp->conn_wroff; 5360 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5361 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5362 sopp.sopp_maxblk = INFPSZ; 5363 sopp.sopp_maxpsz = IP_MAXPACKET; 5364 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5365 icmp_mod_info.mi_minpsz; 5366 5367 (*connp->conn_upcalls->su_set_proto_props) 5368 (connp->conn_upper_handle, &sopp); 5369 5370 icmp_bind_proto(connp->conn_icmp); 5371 } 5372 5373 /* ARGSUSED3 */ 5374 int 5375 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5376 socklen_t *salenp, cred_t *cr) 5377 { 5378 conn_t *connp = (conn_t *)proto_handle; 5379 icmp_t *icmp = connp->conn_icmp; 5380 int error; 5381 5382 /* All Solaris components should pass a cred for this operation. */ 5383 ASSERT(cr != NULL); 5384 5385 mutex_enter(&connp->conn_lock); 5386 if (icmp->icmp_state != TS_DATA_XFER) 5387 error = ENOTCONN; 5388 else 5389 error = conn_getpeername(connp, sa, salenp); 5390 mutex_exit(&connp->conn_lock); 5391 return (error); 5392 } 5393 5394 /* ARGSUSED3 */ 5395 int 5396 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5397 socklen_t *salenp, cred_t *cr) 5398 { 5399 conn_t *connp = (conn_t *)proto_handle; 5400 int error; 5401 5402 /* All Solaris components should pass a cred for this operation. */ 5403 ASSERT(cr != NULL); 5404 5405 mutex_enter(&connp->conn_lock); 5406 error = conn_getsockname(connp, sa, salenp); 5407 mutex_exit(&connp->conn_lock); 5408 return (error); 5409 } 5410 5411 int 5412 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5413 const void *optvalp, socklen_t optlen, cred_t *cr) 5414 { 5415 conn_t *connp = (conn_t *)proto_handle; 5416 int error; 5417 5418 /* All Solaris components should pass a cred for this operation. */ 5419 ASSERT(cr != NULL); 5420 5421 error = proto_opt_check(level, option_name, optlen, NULL, 5422 icmp_opt_obj.odb_opt_des_arr, 5423 icmp_opt_obj.odb_opt_arr_cnt, 5424 B_TRUE, B_FALSE, cr); 5425 5426 if (error != 0) { 5427 /* 5428 * option not recognized 5429 */ 5430 if (error < 0) { 5431 error = proto_tlitosyserr(-error); 5432 } 5433 return (error); 5434 } 5435 5436 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5437 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5438 (uchar_t *)optvalp, NULL, cr); 5439 5440 ASSERT(error >= 0); 5441 5442 return (error); 5443 } 5444 5445 int 5446 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5447 void *optvalp, socklen_t *optlen, cred_t *cr) 5448 { 5449 int error; 5450 conn_t *connp = (conn_t *)proto_handle; 5451 t_uscalar_t max_optbuf_len; 5452 void *optvalp_buf; 5453 int len; 5454 5455 /* All Solaris components should pass a cred for this operation. */ 5456 ASSERT(cr != NULL); 5457 5458 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5459 icmp_opt_obj.odb_opt_des_arr, 5460 icmp_opt_obj.odb_opt_arr_cnt, 5461 B_FALSE, B_TRUE, cr); 5462 5463 if (error != 0) { 5464 if (error < 0) { 5465 error = proto_tlitosyserr(-error); 5466 } 5467 return (error); 5468 } 5469 5470 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5471 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5472 if (len == -1) { 5473 kmem_free(optvalp_buf, max_optbuf_len); 5474 return (EINVAL); 5475 } 5476 5477 /* 5478 * update optlen and copy option value 5479 */ 5480 t_uscalar_t size = MIN(len, *optlen); 5481 5482 bcopy(optvalp_buf, optvalp, size); 5483 bcopy(&size, optlen, sizeof (size)); 5484 5485 kmem_free(optvalp_buf, max_optbuf_len); 5486 return (0); 5487 } 5488 5489 /* ARGSUSED1 */ 5490 int 5491 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5492 { 5493 conn_t *connp = (conn_t *)proto_handle; 5494 5495 /* All Solaris components should pass a cred for this operation. */ 5496 ASSERT(cr != NULL); 5497 5498 (void) rawip_do_close(connp); 5499 return (0); 5500 } 5501 5502 /* ARGSUSED2 */ 5503 int 5504 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5505 { 5506 conn_t *connp = (conn_t *)proto_handle; 5507 5508 /* All Solaris components should pass a cred for this operation. */ 5509 ASSERT(cr != NULL); 5510 5511 /* shut down the send side */ 5512 if (how != SHUT_RD) 5513 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5514 SOCK_OPCTL_SHUT_SEND, 0); 5515 /* shut down the recv side */ 5516 if (how != SHUT_WR) 5517 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5518 SOCK_OPCTL_SHUT_RECV, 0); 5519 return (0); 5520 } 5521 5522 void 5523 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5524 { 5525 conn_t *connp = (conn_t *)proto_handle; 5526 icmp_t *icmp = connp->conn_icmp; 5527 5528 mutex_enter(&icmp->icmp_recv_lock); 5529 connp->conn_flow_cntrld = B_FALSE; 5530 mutex_exit(&icmp->icmp_recv_lock); 5531 } 5532 5533 int 5534 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5535 int mode, int32_t *rvalp, cred_t *cr) 5536 { 5537 conn_t *connp = (conn_t *)proto_handle; 5538 int error; 5539 5540 /* All Solaris components should pass a cred for this operation. */ 5541 ASSERT(cr != NULL); 5542 5543 /* 5544 * If we don't have a helper stream then create one. 5545 * ip_create_helper_stream takes care of locking the conn_t, 5546 * so this check for NULL is just a performance optimization. 5547 */ 5548 if (connp->conn_helper_info == NULL) { 5549 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5550 5551 ASSERT(is->is_ldi_ident != NULL); 5552 5553 /* 5554 * Create a helper stream for non-STREAMS socket. 5555 */ 5556 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5557 if (error != 0) { 5558 ip0dbg(("rawip_ioctl: create of IP helper stream " 5559 "failed %d\n", error)); 5560 return (error); 5561 } 5562 } 5563 5564 switch (cmd) { 5565 case _SIOCSOCKFALLBACK: 5566 case TI_GETPEERNAME: 5567 case TI_GETMYNAME: 5568 #ifdef DEBUG 5569 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5570 " socket", cmd); 5571 #endif 5572 error = EINVAL; 5573 break; 5574 default: 5575 /* 5576 * Pass on to IP using helper stream 5577 */ 5578 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5579 cmd, arg, mode, cr, rvalp); 5580 break; 5581 } 5582 return (error); 5583 } 5584 5585 int 5586 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5587 cred_t *cr) 5588 { 5589 sin6_t *sin6; 5590 sin_t *sin = NULL; 5591 uint_t srcid; 5592 conn_t *connp = (conn_t *)proto_handle; 5593 icmp_t *icmp = connp->conn_icmp; 5594 int error = 0; 5595 icmp_stack_t *is = icmp->icmp_is; 5596 pid_t pid = curproc->p_pid; 5597 ip_xmit_attr_t *ixa; 5598 5599 ASSERT(DB_TYPE(mp) == M_DATA); 5600 5601 /* All Solaris components should pass a cred for this operation. */ 5602 ASSERT(cr != NULL); 5603 5604 /* do an implicit bind if necessary */ 5605 if (icmp->icmp_state == TS_UNBND) { 5606 error = rawip_implicit_bind(connp); 5607 /* 5608 * We could be racing with an actual bind, in which case 5609 * we would see EPROTO. We cross our fingers and try 5610 * to connect. 5611 */ 5612 if (!(error == 0 || error == EPROTO)) { 5613 freemsg(mp); 5614 return (error); 5615 } 5616 } 5617 5618 /* Protocol 255 contains full IP headers */ 5619 /* Read without holding lock */ 5620 if (icmp->icmp_hdrincl) { 5621 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5622 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5623 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5624 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5625 freemsg(mp); 5626 return (EINVAL); 5627 } 5628 } 5629 error = icmp_output_hdrincl(connp, mp, cr, pid); 5630 if (is->is_sendto_ignerr) 5631 return (0); 5632 else 5633 return (error); 5634 } 5635 5636 /* Connected? */ 5637 if (msg->msg_name == NULL) { 5638 if (icmp->icmp_state != TS_DATA_XFER) { 5639 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5640 return (EDESTADDRREQ); 5641 } 5642 if (msg->msg_controllen != 0) { 5643 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5644 NULL, msg, cr, pid); 5645 } else { 5646 error = icmp_output_connected(connp, mp, cr, pid); 5647 } 5648 if (is->is_sendto_ignerr) 5649 return (0); 5650 else 5651 return (error); 5652 } 5653 if (icmp->icmp_state == TS_DATA_XFER) { 5654 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5655 return (EISCONN); 5656 } 5657 error = proto_verify_ip_addr(connp->conn_family, 5658 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5659 if (error != 0) { 5660 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5661 return (error); 5662 } 5663 switch (connp->conn_family) { 5664 case AF_INET6: 5665 sin6 = (sin6_t *)msg->msg_name; 5666 5667 /* No support for mapped addresses on raw sockets */ 5668 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5669 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5670 return (EADDRNOTAVAIL); 5671 } 5672 srcid = sin6->__sin6_src_id; 5673 5674 /* 5675 * If the local address is a mapped address return 5676 * an error. 5677 * It would be possible to send an IPv6 packet but the 5678 * response would never make it back to the application 5679 * since it is bound to a mapped address. 5680 */ 5681 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5682 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5683 return (EADDRNOTAVAIL); 5684 } 5685 5686 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5687 sin6->sin6_addr = ipv6_loopback; 5688 5689 /* 5690 * We have to allocate an ip_xmit_attr_t before we grab 5691 * conn_lock and we need to hold conn_lock once we've check 5692 * conn_same_as_last_v6 to handle concurrent send* calls on a 5693 * socket. 5694 */ 5695 if (msg->msg_controllen == 0) { 5696 ixa = conn_get_ixa(connp, B_FALSE); 5697 if (ixa == NULL) { 5698 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5699 return (ENOMEM); 5700 } 5701 } else { 5702 ixa = NULL; 5703 } 5704 mutex_enter(&connp->conn_lock); 5705 if (icmp->icmp_delayed_error != 0) { 5706 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5707 5708 error = icmp->icmp_delayed_error; 5709 icmp->icmp_delayed_error = 0; 5710 5711 /* Compare IP address and family */ 5712 5713 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5714 &sin2->sin6_addr) && 5715 sin6->sin6_family == sin2->sin6_family) { 5716 mutex_exit(&connp->conn_lock); 5717 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5718 if (ixa != NULL) 5719 ixa_refrele(ixa); 5720 return (error); 5721 } 5722 } 5723 if (msg->msg_controllen != 0) { 5724 mutex_exit(&connp->conn_lock); 5725 ASSERT(ixa == NULL); 5726 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5727 NULL, msg, cr, pid); 5728 } else if (conn_same_as_last_v6(connp, sin6) && 5729 connp->conn_lastsrcid == srcid && 5730 ipsec_outbound_policy_current(ixa)) { 5731 /* icmp_output_lastdst drops conn_lock */ 5732 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5733 } else { 5734 /* icmp_output_newdst drops conn_lock */ 5735 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5736 pid, ixa); 5737 } 5738 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5739 if (is->is_sendto_ignerr) 5740 return (0); 5741 else 5742 return (error); 5743 case AF_INET: 5744 sin = (sin_t *)msg->msg_name; 5745 5746 if (sin->sin_addr.s_addr == INADDR_ANY) 5747 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5748 5749 /* 5750 * We have to allocate an ip_xmit_attr_t before we grab 5751 * conn_lock and we need to hold conn_lock once we've check 5752 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5753 */ 5754 if (msg->msg_controllen == 0) { 5755 ixa = conn_get_ixa(connp, B_FALSE); 5756 if (ixa == NULL) { 5757 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5758 return (ENOMEM); 5759 } 5760 } else { 5761 ixa = NULL; 5762 } 5763 mutex_enter(&connp->conn_lock); 5764 if (icmp->icmp_delayed_error != 0) { 5765 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5766 5767 error = icmp->icmp_delayed_error; 5768 icmp->icmp_delayed_error = 0; 5769 5770 /* Compare IP address */ 5771 5772 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5773 mutex_exit(&connp->conn_lock); 5774 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5775 if (ixa != NULL) 5776 ixa_refrele(ixa); 5777 return (error); 5778 } 5779 } 5780 5781 if (msg->msg_controllen != 0) { 5782 mutex_exit(&connp->conn_lock); 5783 ASSERT(ixa == NULL); 5784 error = icmp_output_ancillary(connp, sin, NULL, mp, 5785 NULL, msg, cr, pid); 5786 } else if (conn_same_as_last_v4(connp, sin) && 5787 ipsec_outbound_policy_current(ixa)) { 5788 /* icmp_output_lastdst drops conn_lock */ 5789 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5790 } else { 5791 /* icmp_output_newdst drops conn_lock */ 5792 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5793 pid, ixa); 5794 } 5795 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5796 if (is->is_sendto_ignerr) 5797 return (0); 5798 else 5799 return (error); 5800 default: 5801 return (EINVAL); 5802 } 5803 } 5804 5805 sock_downcalls_t sock_rawip_downcalls = { 5806 rawip_activate, 5807 rawip_accept, 5808 rawip_bind, 5809 rawip_listen, 5810 rawip_connect, 5811 rawip_getpeername, 5812 rawip_getsockname, 5813 rawip_getsockopt, 5814 rawip_setsockopt, 5815 rawip_send, 5816 NULL, 5817 NULL, 5818 NULL, 5819 rawip_shutdown, 5820 rawip_clr_flowctrl, 5821 rawip_ioctl, 5822 rawip_close 5823 }; 5824