1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* Copyright (c) 1990 Mentat Inc. */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #include <sys/stropts.h> 29 #include <sys/strlog.h> 30 #include <sys/strsun.h> 31 #define _SUN_TPI_VERSION 2 32 #include <sys/tihdr.h> 33 #include <sys/timod.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/strsubr.h> 37 #include <sys/suntpi.h> 38 #include <sys/xti_inet.h> 39 #include <sys/cmn_err.h> 40 #include <sys/kmem.h> 41 #include <sys/cred.h> 42 #include <sys/policy.h> 43 #include <sys/priv.h> 44 #include <sys/ucred.h> 45 #include <sys/zone.h> 46 47 #include <sys/sockio.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/vtrace.h> 51 #include <sys/sdt.h> 52 #include <sys/debug.h> 53 #include <sys/isa_defs.h> 54 #include <sys/random.h> 55 #include <netinet/in.h> 56 #include <netinet/ip6.h> 57 #include <netinet/icmp6.h> 58 #include <netinet/udp.h> 59 60 #include <inet/common.h> 61 #include <inet/ip.h> 62 #include <inet/ip_impl.h> 63 #include <inet/ipsec_impl.h> 64 #include <inet/ip6.h> 65 
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
#include <inet/ip_multi.h>
#include <inet/ip_ndp.h>
#include <inet/proto_set.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/ipclassifier.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h>

#include <sys/disp.h>

/*
 * Synchronization notes:
 *
 * RAWIP is MT and uses the usual kernel synchronization primitives. We use
 * conn_lock to protect the icmp_t.
 *
 * Plumbing notes:
 * ICMP is always a device driver. For compatibility with mibopen() code
 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
 * dummy module.
 */

static void	icmp_addr_req(queue_t *q, mblk_t *mp);
static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void	icmp_bind_proto(icmp_t *icmp);
static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
		    const in6_addr_t *, uint32_t);
static void	icmp_capability_req(queue_t *q, mblk_t *mp);
static int	icmp_close(queue_t *q, int flags);
static void	icmp_close_free(conn_t *);
static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
		    int sys_error);
static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
		    t_scalar_t tlierr, int sys_error);
static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
		    ip_recv_attr_t *);
static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
		    ip_recv_attr_t *);
static void	icmp_info_req(queue_t *q, mblk_t *mp);
static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t	*icmp_open(int family, cred_t *credp, int *err, int flags);
static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
int		icmp_opt_set(conn_t *connp, uint_t optset_context,
		    int level, int name, uint_t inlen,
		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
		    void *thisdg_attrs, cred_t *cr);
int		icmp_opt_get(conn_t *connp, int level, int name,
		    uchar_t *ptr);
static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
		    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
		    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *,
		    int *);
static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
		    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
		    uchar_t *ptr, int len);
static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static void	icmp_wput(queue_t *q, mblk_t *mp);
static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void	icmp_wput_other(queue_t *q, mblk_t *mp);
static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rawip_stack_fini(netstackid_t stackid, void *arg);

static void	*rawip_kstat_init(netstackid_t stackid);
static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int	rawip_kstat_update(kstat_t *kp, int rw);
static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t	*rawip_do_open(int, cred_t *, int *, int);
static void	rawip_do_close(conn_t *);
static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int	rawip_do_unbind(conn_t *);
static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
		    cred_t *, pid_t);

int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);
int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
		    socklen_t *, cred_t *);

static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};

/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/* ICMP entry point during fallback */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/* For AF_INET aka /dev/icmp */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/* For AF_INET6 aka /dev/icmp6 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};

/* Default structure copied into T_INFO_ACK messages */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	/* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};

/*
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Note: All those tunables which do not start with "icmp_" are Committed and
 * therefore are public. See PSARC 2009/306.
 *
 * The position of each entry matters: the is_* accessor macros that follow
 * the table index it by number.
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
	/* tunable - 0 */
	{ "icmp_wroff_extra", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 128, 32}, {32} },

	/* tunable - 1 */
	{ "icmp_ipv4_ttl", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 255, 255}, {255} },

	/* tunable - 2 */
	{ "icmp_ipv6_hoplimit", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
	    {IPV6_DEFAULT_HOPS} },

	/* tunable - 3 */
	{ "icmp_bsd_compat", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	/* tunable - 4 */
	{ "send_maxbuf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {4096, 65536, 8192}, {8192} },

	/* tunable - 5 */
	{ "icmp_xmit_lowat", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 65536, 1024}, {1024} },

	/* tunable - 6 */
	{ "recv_maxbuf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {4096, 65536, 8192}, {8192} },

	/* tunable - 7 */
	{ "icmp_max_buf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {65536, 1024*1024*1024, 256*1024}, {256 * 1024} },

	/* tunable - 8 */
	{ "icmp_pmtu_discovery", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/* tunable - 9 */
	{ "icmp_sendto_ignerr", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

	{ NULL, 0, NULL, NULL, {0}, {0} }
};

/* Accessors for the current value of each tunable, by table index. */
#define	is_wroff_extra		is_propinfo_tbl[0].prop_cur_uval
#define	is_ipv4_ttl		is_propinfo_tbl[1].prop_cur_uval
#define	is_ipv6_hoplimit	is_propinfo_tbl[2].prop_cur_uval
#define	is_bsd_compat		is_propinfo_tbl[3].prop_cur_bval
#define	is_xmit_hiwat		is_propinfo_tbl[4].prop_cur_uval
#define	is_xmit_lowat		is_propinfo_tbl[5].prop_cur_uval
#define	is_recv_hiwat		is_propinfo_tbl[6].prop_cur_uval
#define	is_max_buf		is_propinfo_tbl[7].prop_cur_uval
#define	is_pmtu_discovery	is_propinfo_tbl[8].prop_cur_bval
#define	is_sendto_ignerr	is_propinfo_tbl[9].prop_cur_bval

typedef union T_primitives *t_primp_t;

/*
 * TPI entry point for O_T_BIND_REQ/T_BIND_REQ messages handed to icmp_wput.
 *
 * Validates the request, locates (or synthesizes, for a zero-length address)
 * the sockaddr to bind to, and lets rawip_do_bind() do the real work.
 * On success the request mblk is turned into the T_BIND_ACK and sent back
 * upstream; on failure a T_ERROR_ACK is generated instead.
 */
static void
icmp_tpi_bind(queue_t *q, mblk_t *mp)
{
	conn_t			*connp = Q_TO_CONN(q);
	icmp_t			*icmp;
	struct T_bind_req	*tbr;
	struct sockaddr		*sa;
	socklen_t		len;
	mblk_t			*newmp;
	cred_t			*cr;
	int			error;

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we ASSERT.
	 * But in case there is some other M_PROTO that looks
	 * like a TPI message sent by some other kernel
	 * component, we check and return an error.
	 */
	cr = msg_getcred(mp, NULL);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		icmp_err_ack(q, mp, TSYSERR, EINVAL);
		return;
	}

	icmp = connp->conn_icmp;

	/* The message must at least hold a complete T_bind_req. */
	if (MBLKL(mp) < sizeof (*tbr)) {
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "icmp_bind: bad req, len %u",
		    (uint_t)(mp->b_wptr - mp->b_rptr));
		icmp_err_ack(q, mp, TPROTO, 0);
		return;
	}

	/* Early, unlocked state check; rawip_do_bind() rechecks under lock. */
	if (icmp->icmp_state != TS_UNBND) {
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "icmp_bind: bad state, %u", icmp->icmp_state);
		icmp_err_ack(q, mp, TOUTSTATE, 0);
		return;
	}

	/*
	 * Grow the message (keeping its contents) so that the ack can carry
	 * an address as large as a sin6_t.
	 */
	newmp = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
	if (newmp == NULL) {
		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
		return;
	}
	mp = newmp;

	/* Reset the message type in preparation for shipping it back. */
	DB_TYPE(mp) = M_PCPROTO;
	tbr = (struct T_bind_req *)mp->b_rptr;
	len = tbr->ADDR_length;

	switch (len) {
	case 0:
		/*
		 * No address supplied: build an all-zero address of the
		 * stream's own family right behind the T_bind_req.
		 */
		tbr->ADDR_offset = sizeof (struct T_bind_req);
		if (connp->conn_family == AF_INET) {
			sin_t	*sin = (sin_t *)&tbr[1];

			tbr->ADDR_length = sizeof (sin_t);
			*sin = sin_null;
			sin->sin_family = AF_INET;
			mp->b_wptr = (uchar_t *)&sin[1];
			sa = (struct sockaddr *)sin;
			len = sizeof (sin_t);
		} else {
			sin6_t	*sin6 = (sin6_t *)&tbr[1];

			ASSERT(connp->conn_family == AF_INET6);
			tbr->ADDR_length = sizeof (sin6_t);
			*sin6 = sin6_null;
			sin6->sin6_family = AF_INET6;
			mp->b_wptr = (uchar_t *)&sin6[1];
			sa = (struct sockaddr *)sin6;
			len = sizeof (sin6_t);
		}
		break;

	case sizeof (sin_t):	/* Complete IPv4 address */
	case sizeof (sin6_t):	/* Complete IPv6 address */
		/* len is exactly the size of the address being fetched. */
		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
		    len);
		break;

	default:
		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
		    "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
		icmp_err_ack(q, mp, TBADADDR, 0);
		return;
	}

	/*
	 * rawip_do_bind() returns 0, a positive errno, or a negated TLI
	 * error code.
	 */
	error = rawip_do_bind(connp, sa, len);
	if (error > 0) {
		icmp_err_ack(q, mp, TSYSERR, error);
		return;
	}
	if (error < 0) {
		icmp_err_ack(q, mp, -error, 0);
		return;
	}

	/* Success: the (possibly rewritten) request doubles as the ack. */
	tbr->PRIM_type = T_BIND_ACK;
	qreply(q, mp);
}

/*
 * Bind connp to the local address in sa (a sin_t or sin6_t, selected by len).
 *
 * Shared by the TPI and socket paths. Verifies the local address, records it
 * in the conn_t, builds an initial header template and inserts the conn_t in
 * the fanout table.
 *
 * Returns 0 on success, a positive errno, or a negated TLI error
 * (-TOUTSTATE when the endpoint is not in TS_UNBND).
 */
static int
rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
{
	icmp_t		*icmp = connp->conn_icmp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	sin_t		*sin;
	sin6_t		*sin6;
	in_port_t	lport;		/* Network byte order */
	ipaddr_t	v4src;		/* Set if AF_INET */
	in6_addr_t	v6src;
	uint_t		scopeid = 0;
	int		error = 0;

	if (sa == NULL || !OK_32PTR((char *)sa)) {
		return (EINVAL);
	}

	switch (len) {
	case sizeof (sin_t):	/* Complete IPv4 address */
		sin = (sin_t *)sa;
		if (sin->sin_family != AF_INET ||
		    connp->conn_family != AF_INET) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		v4src = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
		/* A wildcard bind keeps the IPVL_UNICAST_UP default. */
		if (v4src != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
			    B_TRUE);
		}
		lport = sin->sin_port;
		break;

	case sizeof (sin6_t):	/* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		if (sin6->sin6_family != AF_INET6 ||
		    connp->conn_family != AF_INET6) {
			/* TSYSERR, EAFNOSUPPORT */
			return (EAFNOSUPPORT);
		}
		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			/* TSYSERR, EADDRNOTAVAIL */
			return (EADDRNOTAVAIL);
		}
		v6src = sin6->sin6_addr;
		if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			/* The scope id only matters for link-scoped sources */
			if (IN6_IS_ADDR_LINKSCOPE(&v6src))
				scopeid = sin6->sin6_scope_id;
			laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
			    B_TRUE, scopeid);
		}
		lport = sin6->sin6_port;
		break;

	default:
		/* TBADADDR */
		return (EADDRNOTAVAIL);
	}

	/* Is the local address a valid unicast, multicast, or broadcast? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	/*
	 * The state must be TS_UNBND.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_state != TS_UNBND) {
		mutex_exit(&connp->conn_lock);
		return (-TOUTSTATE);
	}

	/*
	 * Copy the source address into our icmp structure.  This address
	 * may still be zero; if so, ip will fill in the correct address
	 * each time an outbound packet is passed to it.
	 * If we are binding to a broadcast or multicast address then
	 * we just set the conn_bound_addr since we don't want to use
	 * that as the source address when sending.
	 */
	connp->conn_bound_addr_v6 = v6src;
	connp->conn_laddr_v6 = v6src;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
		connp->conn_ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	switch (laddr_type) {
	case IPVL_UNICAST_UP:
	case IPVL_UNICAST_DOWN:
		connp->conn_saddr_v6 = v6src;
		connp->conn_mcbc_bind = B_FALSE;
		break;
	case IPVL_MCAST:
	case IPVL_BCAST:
		/* ip_set_destination will pick a source address later */
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_mcbc_bind = B_TRUE;
		break;
	}

	/* Any errors after this point should use late_error */

	/*
	 * Use sin_port/sin6_port since applications like psh use SOCK_RAW
	 * with IPPROTO_TCP.
	 */
	connp->conn_lport = lport;
	connp->conn_fport = 0;

	/* The IP version of the conn_t must agree with its family. */
	ASSERT(connp->conn_ipversion ==
	    (connp->conn_family == AF_INET ? IPV4_VERSION : IPV6_VERSION));

	icmp->icmp_state = TS_IDLE;

	/*
	 * We create an initial header template here to make a subsequent
	 * sendto have a starting point. Since conn_last_dst is zero the
	 * first sendto will always follow the 'dst changed' code path.
	 * Note that we defer massaging options and the related checksum
	 * adjustment until we have a destination address.
	 */
	error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	if (error != 0) {
		/* late_error retakes conn_lock itself */
		mutex_exit(&connp->conn_lock);
		goto late_error;
	}
	/* Just in case */
	connp->conn_faddr_v6 = ipv6_all_zeros;
	connp->conn_v6lastdst = ipv6_all_zeros;
	mutex_exit(&connp->conn_lock);

	error = ip_laddr_fanout_insert(connp);
	if (error != 0)
		goto late_error;

	/* Bind succeeded */
	return (0);

late_error:
	/* Undo everything recorded above and fall back to TS_UNBND. */
	mutex_enter(&connp->conn_lock);
	connp->conn_saddr_v6 = ipv6_all_zeros;
	connp->conn_bound_addr_v6 = ipv6_all_zeros;
	connp->conn_laddr_v6 = ipv6_all_zeros;
	if (scopeid != 0) {
		connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}
	icmp->icmp_state = TS_UNBND;
	connp->conn_v6lastdst = ipv6_all_zeros;
	connp->conn_lport = 0;

	/* Restore the header that was built above - different source address */
	(void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, connp->conn_flowinfo);
	mutex_exit(&connp->conn_lock);
	return (error);
}

/*
 * Tell IP to just bind to the protocol.
 *
 * Clears the source, local, foreign and last-destination addresses under
 * conn_lock and then (re)inserts the conn_t in the fanout; the result of
 * the insertion is deliberately ignored.
 */
static void
icmp_bind_proto(icmp_t *icmp)
{
	conn_t	*connp = icmp->icmp_connp;

	mutex_enter(&connp->conn_lock);
	connp->conn_saddr_v6 = ipv6_all_zeros;
	connp->conn_laddr_v6 = ipv6_all_zeros;
	connp->conn_faddr_v6 = ipv6_all_zeros;
	connp->conn_v6lastdst = ipv6_all_zeros;
	mutex_exit(&connp->conn_lock);

	(void) ip_laddr_fanout_insert(connp);
}

/*
 * This routine handles each T_CONN_REQ message passed to icmp.  It
 * associates a default destination address with the stream.
 *
 * After various error checks are completed, icmp_connect() lays
 * the target address and port into the composite header template.
596 * Then we ask IP for information, including a source address if we didn't 597 * already have one. Finally we send up the T_OK_ACK reply message. 598 */ 599 static void 600 icmp_tpi_connect(queue_t *q, mblk_t *mp) 601 { 602 conn_t *connp = Q_TO_CONN(q); 603 struct T_conn_req *tcr; 604 struct sockaddr *sa; 605 socklen_t len; 606 int error; 607 cred_t *cr; 608 pid_t pid; 609 /* 610 * All Solaris components should pass a db_credp 611 * for this TPI message, hence we ASSERT. 612 * But in case there is some other M_PROTO that looks 613 * like a TPI message sent by some other kernel 614 * component, we check and return an error. 615 */ 616 cr = msg_getcred(mp, &pid); 617 ASSERT(cr != NULL); 618 if (cr == NULL) { 619 icmp_err_ack(q, mp, TSYSERR, EINVAL); 620 return; 621 } 622 623 tcr = (struct T_conn_req *)mp->b_rptr; 624 /* Sanity checks */ 625 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 626 icmp_err_ack(q, mp, TPROTO, 0); 627 return; 628 } 629 630 if (tcr->OPT_length != 0) { 631 icmp_err_ack(q, mp, TBADOPT, 0); 632 return; 633 } 634 635 len = tcr->DEST_length; 636 637 switch (len) { 638 default: 639 icmp_err_ack(q, mp, TBADADDR, 0); 640 return; 641 case sizeof (sin_t): 642 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 643 sizeof (sin_t)); 644 break; 645 case sizeof (sin6_t): 646 sa = (struct sockaddr *)mi_offset_param(mp, 647 tcr->DEST_offset, sizeof (sin6_t)); 648 break; 649 } 650 651 error = proto_verify_ip_addr(connp->conn_family, sa, len); 652 if (error != 0) { 653 icmp_err_ack(q, mp, TSYSERR, error); 654 return; 655 } 656 657 error = rawip_do_connect(connp, sa, len, cr, pid); 658 if (error != 0) { 659 if (error < 0) { 660 icmp_err_ack(q, mp, -error, 0); 661 } else { 662 icmp_err_ack(q, mp, 0, error); 663 } 664 } else { 665 mblk_t *mp1; 666 667 /* 668 * We have to send a connection confirmation to 669 * keep TLI happy. 
670 */ 671 if (connp->conn_family == AF_INET) { 672 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 673 sizeof (sin_t), NULL, 0); 674 } else { 675 ASSERT(connp->conn_family == AF_INET6); 676 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 677 sizeof (sin6_t), NULL, 0); 678 } 679 if (mp1 == NULL) { 680 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 681 return; 682 } 683 684 /* 685 * Send ok_ack for T_CONN_REQ 686 */ 687 mp = mi_tpi_ok_ack_alloc(mp); 688 if (mp == NULL) { 689 /* Unable to reuse the T_CONN_REQ for the ack. */ 690 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 691 return; 692 } 693 putnext(connp->conn_rq, mp); 694 putnext(connp->conn_rq, mp1); 695 } 696 } 697 698 static int 699 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 700 cred_t *cr, pid_t pid) 701 { 702 icmp_t *icmp; 703 sin_t *sin; 704 sin6_t *sin6; 705 int error; 706 uint16_t dstport; 707 ipaddr_t v4dst; 708 in6_addr_t v6dst; 709 uint32_t flowinfo; 710 ip_xmit_attr_t *ixa; 711 ip_xmit_attr_t *oldixa; 712 uint_t scopeid = 0; 713 uint_t srcid = 0; 714 in6_addr_t v6src = connp->conn_saddr_v6; 715 716 icmp = connp->conn_icmp; 717 718 if (sa == NULL || !OK_32PTR((char *)sa)) { 719 return (EINVAL); 720 } 721 722 ASSERT(sa != NULL && len != 0); 723 724 /* 725 * Determine packet type based on type of address passed in 726 * the request should contain an IPv4 or IPv6 address. 727 * Make sure that address family matches the type of 728 * family of the address passed down. 
729 */ 730 switch (len) { 731 case sizeof (sin_t): 732 sin = (sin_t *)sa; 733 734 v4dst = sin->sin_addr.s_addr; 735 dstport = sin->sin_port; 736 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 737 ASSERT(connp->conn_ipversion == IPV4_VERSION); 738 break; 739 740 case sizeof (sin6_t): 741 sin6 = (sin6_t *)sa; 742 743 /* No support for mapped addresses on raw sockets */ 744 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 745 return (EADDRNOTAVAIL); 746 } 747 v6dst = sin6->sin6_addr; 748 dstport = sin6->sin6_port; 749 ASSERT(connp->conn_ipversion == IPV6_VERSION); 750 flowinfo = sin6->sin6_flowinfo; 751 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 752 scopeid = sin6->sin6_scope_id; 753 srcid = sin6->__sin6_src_id; 754 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 755 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 756 connp->conn_netstack); 757 } 758 break; 759 } 760 761 /* 762 * If there is a different thread using conn_ixa then we get a new 763 * copy and cut the old one loose from conn_ixa. Otherwise we use 764 * conn_ixa and prevent any other thread from using/changing it. 765 * Once connect() is done other threads can use conn_ixa since the 766 * refcnt will be back at one. 767 * We defer updating conn_ixa until later to handle any concurrent 768 * conn_ixa_cleanup thread. 769 */ 770 ixa = conn_get_ixa(connp, B_FALSE); 771 if (ixa == NULL) 772 return (ENOMEM); 773 774 ASSERT(ixa->ixa_refcnt >= 2); 775 ASSERT(ixa == connp->conn_ixa); 776 777 mutex_enter(&connp->conn_lock); 778 /* 779 * This icmp_t must have bound already before doing a connect. 780 * Reject if a connect is in progress (we drop conn_lock during 781 * rawip_do_connect). 
782 */ 783 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 784 mutex_exit(&connp->conn_lock); 785 ixa_refrele(ixa); 786 return (-TOUTSTATE); 787 } 788 789 if (icmp->icmp_state == TS_DATA_XFER) { 790 /* Already connected - clear out state */ 791 if (connp->conn_mcbc_bind) 792 connp->conn_saddr_v6 = ipv6_all_zeros; 793 else 794 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 795 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 796 connp->conn_faddr_v6 = ipv6_all_zeros; 797 icmp->icmp_state = TS_IDLE; 798 } 799 800 /* 801 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 802 * with IPPROTO_TCP. 803 */ 804 connp->conn_fport = dstport; 805 if (connp->conn_ipversion == IPV4_VERSION) { 806 /* 807 * Interpret a zero destination to mean loopback. 808 * Update the T_CONN_REQ (sin/sin6) since it is used to 809 * generate the T_CONN_CON. 810 */ 811 if (v4dst == INADDR_ANY) { 812 v4dst = htonl(INADDR_LOOPBACK); 813 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 814 ASSERT(connp->conn_family == AF_INET); 815 sin->sin_addr.s_addr = v4dst; 816 } 817 connp->conn_faddr_v6 = v6dst; 818 connp->conn_flowinfo = 0; 819 } else { 820 ASSERT(connp->conn_ipversion == IPV6_VERSION); 821 /* 822 * Interpret a zero destination to mean loopback. 823 * Update the T_CONN_REQ (sin/sin6) since it is used to 824 * generate the T_CONN_CON. 
825 */ 826 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 827 v6dst = ipv6_loopback; 828 sin6->sin6_addr = v6dst; 829 } 830 connp->conn_faddr_v6 = v6dst; 831 connp->conn_flowinfo = flowinfo; 832 } 833 834 /* 835 * We update our cred/cpid based on the caller of connect 836 */ 837 if (connp->conn_cred != cr) { 838 crhold(cr); 839 crfree(connp->conn_cred); 840 connp->conn_cred = cr; 841 } 842 connp->conn_cpid = pid; 843 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 844 ixa->ixa_cred = cr; 845 ixa->ixa_cpid = pid; 846 if (is_system_labeled()) { 847 /* We need to restart with a label based on the cred */ 848 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 849 } 850 851 if (scopeid != 0) { 852 ixa->ixa_flags |= IXAF_SCOPEID_SET; 853 ixa->ixa_scopeid = scopeid; 854 connp->conn_incoming_ifindex = scopeid; 855 } else { 856 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 857 connp->conn_incoming_ifindex = connp->conn_bound_if; 858 } 859 860 /* 861 * conn_connect will drop conn_lock and reacquire it. 862 * To prevent a send* from messing with this icmp_t while the lock 863 * is dropped we set icmp_state and clear conn_v6lastdst. 864 * That will make all send* fail with EISCONN. 865 */ 866 connp->conn_v6lastdst = ipv6_all_zeros; 867 icmp->icmp_state = TS_WCON_CREQ; 868 869 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 870 mutex_exit(&connp->conn_lock); 871 if (error != 0) 872 goto connect_failed; 873 874 /* 875 * The addresses have been verified. Time to insert in 876 * the correct fanout list. 
877 */ 878 error = ipcl_conn_insert(connp); 879 if (error != 0) 880 goto connect_failed; 881 882 mutex_enter(&connp->conn_lock); 883 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 884 &connp->conn_faddr_v6, connp->conn_flowinfo); 885 if (error != 0) { 886 mutex_exit(&connp->conn_lock); 887 goto connect_failed; 888 } 889 890 icmp->icmp_state = TS_DATA_XFER; 891 /* Record this as the "last" send even though we haven't sent any */ 892 connp->conn_v6lastdst = connp->conn_faddr_v6; 893 connp->conn_lastipversion = connp->conn_ipversion; 894 connp->conn_lastdstport = connp->conn_fport; 895 connp->conn_lastflowinfo = connp->conn_flowinfo; 896 connp->conn_lastscopeid = scopeid; 897 connp->conn_lastsrcid = srcid; 898 /* Also remember a source to use together with lastdst */ 899 connp->conn_v6lastsrc = v6src; 900 901 oldixa = conn_replace_ixa(connp, ixa); 902 mutex_exit(&connp->conn_lock); 903 ixa_refrele(oldixa); 904 905 ixa_refrele(ixa); 906 return (0); 907 908 connect_failed: 909 if (ixa != NULL) 910 ixa_refrele(ixa); 911 mutex_enter(&connp->conn_lock); 912 icmp->icmp_state = TS_IDLE; 913 /* In case the source address was set above */ 914 if (connp->conn_mcbc_bind) 915 connp->conn_saddr_v6 = ipv6_all_zeros; 916 else 917 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 918 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 919 connp->conn_faddr_v6 = ipv6_all_zeros; 920 connp->conn_v6lastdst = ipv6_all_zeros; 921 connp->conn_flowinfo = 0; 922 923 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 924 &connp->conn_faddr_v6, connp->conn_flowinfo); 925 mutex_exit(&connp->conn_lock); 926 return (error); 927 } 928 929 static void 930 rawip_do_close(conn_t *connp) 931 { 932 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 933 934 ip_quiesce_conn(connp); 935 936 if (!IPCL_IS_NONSTR(connp)) { 937 qprocsoff(connp->conn_rq); 938 } 939 940 icmp_close_free(connp); 941 942 /* 943 * Now we are truly single threaded on this stream, and can 944 * delete the things 
hanging off the connp, and finally the connp. 945 * We removed this connp from the fanout list, it cannot be 946 * accessed thru the fanouts, and we already waited for the 947 * conn_ref to drop to 0. We are already in close, so 948 * there cannot be any other thread from the top. qprocsoff 949 * has completed, and service has completed or won't run in 950 * future. 951 */ 952 ASSERT(connp->conn_ref == 1); 953 954 if (!IPCL_IS_NONSTR(connp)) { 955 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 956 } else { 957 ip_free_helper_stream(connp); 958 } 959 960 connp->conn_ref--; 961 ipcl_conn_destroy(connp); 962 } 963 964 static int 965 icmp_close(queue_t *q, int flags) 966 { 967 conn_t *connp; 968 969 if (flags & SO_FALLBACK) { 970 /* 971 * stream is being closed while in fallback 972 * simply free the resources that were allocated 973 */ 974 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 975 qprocsoff(q); 976 goto done; 977 } 978 979 connp = Q_TO_CONN(q); 980 (void) rawip_do_close(connp); 981 done: 982 q->q_ptr = WR(q)->q_ptr = NULL; 983 return (0); 984 } 985 986 static void 987 icmp_close_free(conn_t *connp) 988 { 989 icmp_t *icmp = connp->conn_icmp; 990 991 if (icmp->icmp_filter != NULL) { 992 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 993 icmp->icmp_filter = NULL; 994 } 995 996 /* 997 * Clear any fields which the kmem_cache constructor clears. 998 * Only icmp_connp needs to be preserved. 999 * TBD: We should make this more efficient to avoid clearing 1000 * everything. 1001 */ 1002 ASSERT(icmp->icmp_connp == connp); 1003 bzero(icmp, sizeof (icmp_t)); 1004 icmp->icmp_connp = connp; 1005 } 1006 1007 /* 1008 * This routine handles each T_DISCON_REQ message passed to icmp 1009 * as an indicating that ICMP is no longer connected. This results 1010 * in telling IP to restore the binding to just the local address. 
1011 */ 1012 static int 1013 icmp_do_disconnect(conn_t *connp) 1014 { 1015 icmp_t *icmp = connp->conn_icmp; 1016 int error; 1017 1018 mutex_enter(&connp->conn_lock); 1019 if (icmp->icmp_state != TS_DATA_XFER) { 1020 mutex_exit(&connp->conn_lock); 1021 return (-TOUTSTATE); 1022 } 1023 if (connp->conn_mcbc_bind) 1024 connp->conn_saddr_v6 = ipv6_all_zeros; 1025 else 1026 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1027 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1028 connp->conn_faddr_v6 = ipv6_all_zeros; 1029 icmp->icmp_state = TS_IDLE; 1030 1031 connp->conn_v6lastdst = ipv6_all_zeros; 1032 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1033 &connp->conn_faddr_v6, connp->conn_flowinfo); 1034 mutex_exit(&connp->conn_lock); 1035 if (error != 0) 1036 return (error); 1037 1038 /* 1039 * Tell IP to remove the full binding and revert 1040 * to the local address binding. 1041 */ 1042 return (ip_laddr_fanout_insert(connp)); 1043 } 1044 1045 static void 1046 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1047 { 1048 conn_t *connp = Q_TO_CONN(q); 1049 int error; 1050 1051 /* 1052 * Allocate the largest primitive we need to send back 1053 * T_error_ack is > than T_ok_ack 1054 */ 1055 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1056 if (mp == NULL) { 1057 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1058 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1059 return; 1060 } 1061 1062 error = icmp_do_disconnect(connp); 1063 1064 if (error != 0) { 1065 if (error > 0) { 1066 icmp_err_ack(q, mp, 0, error); 1067 } else { 1068 icmp_err_ack(q, mp, -error, 0); 1069 } 1070 } else { 1071 mp = mi_tpi_ok_ack_alloc(mp); 1072 ASSERT(mp != NULL); 1073 qreply(q, mp); 1074 } 1075 } 1076 1077 static int 1078 icmp_disconnect(conn_t *connp) 1079 { 1080 int error; 1081 1082 connp->conn_dgram_errind = B_FALSE; 1083 1084 error = icmp_do_disconnect(connp); 1085 1086 if (error < 0) 1087 error = proto_tlitosyserr(-error); 1088 return (error); 1089 } 1090 1091 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1092 static void 1093 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1094 { 1095 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1096 qreply(q, mp); 1097 } 1098 1099 /* Shorthand to generate and send TPI error acks to our client */ 1100 static void 1101 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1102 t_scalar_t t_error, int sys_error) 1103 { 1104 struct T_error_ack *teackp; 1105 1106 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1107 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1108 teackp = (struct T_error_ack *)mp->b_rptr; 1109 teackp->ERROR_prim = primitive; 1110 teackp->TLI_error = t_error; 1111 teackp->UNIX_error = sys_error; 1112 qreply(q, mp); 1113 } 1114 } 1115 1116 /* 1117 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1118 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1119 * Assumes that IP has pulled up everything up to and including the ICMP header. 
 */
/* ARGSUSED2 */
static void
icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t		*connp = (conn_t *)arg1;
	icmp_t		*icmp = connp->conn_icmp;
	icmph_t		*icmph;
	ipha_t		*ipha;
	int		iph_hdr_length;
	sin_t		sin;
	mblk_t		*mp1;
	int		error = 0;

	ipha = (ipha_t *)mp->b_rptr;

	ASSERT(OK_32PTR(mp->b_rptr));

	/* Anything that is not IPv4 is handed to the IPv6 variant. */
	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
		icmp_icmp_error_ipv6(connp, mp, ira);
		return;
	}
	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);

	/* Skip past the outer IP and ICMP headers */
	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */

	iph_hdr_length = IPH_HDR_LENGTH(ipha);

	/*
	 * Only port/protocol unreachable set "error"; every other type or
	 * code leaves it at 0 and the message is simply freed below.
	 */
	switch (icmph->icmph_type) {
	case ICMP_DEST_UNREACHABLE:
		switch (icmph->icmph_code) {
		case ICMP_FRAGMENTATION_NEEDED: {
			/*
			 * This ipha shadows the outer variable; here it
			 * refers to the conn's header template.
			 */
			ipha_t		*ipha;
			ip_xmit_attr_t	*ixa;
			/*
			 * IP has already adjusted the path MTU.
			 * But we need to adjust DF for IPv4.
			 */
			if (connp->conn_ipversion != IPV4_VERSION)
				break;

			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL || ixa->ixa_ire == NULL) {
				/*
				 * Some other thread holds conn_ixa. We will
				 * redo this on the next ICMP too big.
				 */
				if (ixa != NULL)
					ixa_refrele(ixa);
				break;
			}
			(void) ip_get_pmtu(ixa);

			/* Mirror IXAF_PMTU_IPV4_DF into the header template */
			mutex_enter(&connp->conn_lock);
			ipha = (ipha_t *)connp->conn_ht_iphc;
			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
				ipha->ipha_fragment_offset_and_flags |=
				    IPH_DF_HTONS;
			} else {
				ipha->ipha_fragment_offset_and_flags &=
				    ~IPH_DF_HTONS;
			}
			mutex_exit(&connp->conn_lock);
			ixa_refrele(ixa);
			break;
		}
		case ICMP_PORT_UNREACHABLE:
		case ICMP_PROTOCOL_UNREACHABLE:
			error = ECONNREFUSED;
			break;
		default:
			/* Transient errors */
			break;
		}
		break;
	default:
		/* Transient errors */
		break;
	}
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The error is reported against the inner packet's destination. */
	sin = sin_null;
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = ipha->ipha_dst;

	if (IPCL_IS_NONSTR(connp)) {
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
				/* conn_lock is dropped before the upcall. */
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			/* Not connected: record the error and its address. */
			icmp->icmp_delayed_error = error;
			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		/* STREAMS endpoint: send a T_UDERROR_IND up the read queue. */
		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
		    error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}

/*
 * icmp_icmp_error_ipv6 is called by icmp_icmp_input to process ICMP for IPv6.
 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
 * Assumes that IP has pulled up all the extension headers as well as the
 * ICMPv6 header.
 */
static void
icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h, *outer_ip6h;
	uint16_t	iph_hdr_length;
	uint8_t		*nexthdrp;
	sin6_t		sin6;
	mblk_t		*mp1;
	int		error = 0;
	icmp_t		*icmp = connp->conn_icmp;

	outer_ip6h = (ip6_t *)mp->b_rptr;
#ifdef DEBUG
	/* Cross-check the header length that IP recorded in ira. */
	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
	else
		iph_hdr_length = IPV6_HDR_LEN;
	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
#endif
	/* Skip past the outer IP and ICMP headers */
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];

	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
	/* Drop the message if the inner header chain cannot be parsed. */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
		freemsg(mp);
		return;
	}

	/*
	 * Only "no port" and a next-header parameter problem set "error";
	 * ICMP6_PACKET_TOO_BIG is handled entirely inside its case.
	 */
	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		switch (icmp6->icmp6_code) {
		case ICMP6_DST_UNREACH_NOPORT:
			error = ECONNREFUSED;
			break;
		case ICMP6_DST_UNREACH_ADMIN:
		case ICMP6_DST_UNREACH_NOROUTE:
		case ICMP6_DST_UNREACH_BEYONDSCOPE:
		case ICMP6_DST_UNREACH_ADDR:
			/* Transient errors */
			break;
		default:
			break;
		}
		break;
	case ICMP6_PACKET_TOO_BIG: {
		struct T_unitdata_ind	*tudi;
		struct T_opthdr		*toh;
		size_t			udi_size;
		mblk_t			*newmp;
		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
		    sizeof (struct ip6_mtuinfo);
		/* This sin6 pointer shadows the function-level sin6_t. */
		sin6_t			*sin6;
		struct ip6_mtuinfo	*mtuinfo;

		/*
		 * If the application has requested to receive path mtu
		 * information, send up an empty message containing an
		 * IPV6_PATHMTU ancillary data item.
		 */
		if (!connp->conn_ipv6_recvpathmtu)
			break;

		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
		    opt_length;
		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
			break;
		}

		/*
		 * newmp->b_cont is left to NULL on purpose.  This is an
		 * empty message containing only ancillary data.
		 */
		newmp->b_datap->db_type = M_PROTO;
		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
		newmp->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin6_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
		tudi->OPT_length = opt_length;

		/* Source address: the conn's foreign address. */
		sin6 = (sin6_t *)&tudi[1];
		bzero(sin6, sizeof (sin6_t));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = connp->conn_faddr_v6;

		/* Option header immediately follows the address. */
		toh = (struct T_opthdr *)&sin6[1];
		toh->level = IPPROTO_IPV6;
		toh->name = IPV6_PATHMTU;
		toh->len = opt_length;
		toh->status = 0;

		/* Option value: inner destination and the reported MTU. */
		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
		/*
		 * We've consumed everything we need from the original
		 * message.  Free it, then send our empty message.
		 */
		freemsg(mp);
		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
		return;
	}
	case ICMP6_TIME_EXCEEDED:
		/* Transient errors */
		break;
	case ICMP6_PARAM_PROB:
		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
		    (uchar_t *)nexthdrp) {
			error = ECONNREFUSED;
			break;
		}
		break;
	}
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The error is reported against the inner packet's destination. */
	sin6 = sin6_null;
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = ip6h->ip6_dst;
	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
	if (IPCL_IS_NONSTR(connp)) {
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
			    &connp->conn_faddr_v6)) {
				/* conn_lock is dropped before the upcall. */
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			/* Not connected: record the error and its address. */
			icmp->icmp_delayed_error = error;
			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		/* STREAMS endpoint: send a T_UDERROR_IND up the read queue. */
		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
		    NULL, 0, error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}

/*
 * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
 * The local address is filled in if endpoint is bound. The remote address
 * is filled in if remote address has been specified ("connected endpoint")
 * (The concept of connected CLTS sockets is alien to published TPI
 * but we support it anyway).
1422 */ 1423 static void 1424 icmp_addr_req(queue_t *q, mblk_t *mp) 1425 { 1426 struct sockaddr *sa; 1427 mblk_t *ackmp; 1428 struct T_addr_ack *taa; 1429 icmp_t *icmp = Q_TO_ICMP(q); 1430 conn_t *connp = icmp->icmp_connp; 1431 uint_t addrlen; 1432 1433 /* Make it large enough for worst case */ 1434 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1435 2 * sizeof (sin6_t), 1); 1436 if (ackmp == NULL) { 1437 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1438 return; 1439 } 1440 taa = (struct T_addr_ack *)ackmp->b_rptr; 1441 1442 bzero(taa, sizeof (struct T_addr_ack)); 1443 ackmp->b_wptr = (uchar_t *)&taa[1]; 1444 1445 taa->PRIM_type = T_ADDR_ACK; 1446 ackmp->b_datap->db_type = M_PCPROTO; 1447 1448 if (connp->conn_family == AF_INET) 1449 addrlen = sizeof (sin_t); 1450 else 1451 addrlen = sizeof (sin6_t); 1452 1453 mutex_enter(&connp->conn_lock); 1454 /* 1455 * Note: Following code assumes 32 bit alignment of basic 1456 * data structures like sin_t and struct T_addr_ack. 1457 */ 1458 if (icmp->icmp_state != TS_UNBND) { 1459 /* 1460 * Fill in local address first 1461 */ 1462 taa->LOCADDR_offset = sizeof (*taa); 1463 taa->LOCADDR_length = addrlen; 1464 sa = (struct sockaddr *)&taa[1]; 1465 (void) conn_getsockname(connp, sa, &addrlen); 1466 ackmp->b_wptr += addrlen; 1467 } 1468 if (icmp->icmp_state == TS_DATA_XFER) { 1469 /* 1470 * connected, fill remote address too 1471 */ 1472 taa->REMADDR_length = addrlen; 1473 /* assumed 32-bit alignment */ 1474 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1475 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1476 (void) conn_getpeername(connp, sa, &addrlen); 1477 ackmp->b_wptr += addrlen; 1478 } 1479 mutex_exit(&connp->conn_lock); 1480 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1481 qreply(q, ackmp); 1482 } 1483 1484 static void 1485 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1486 { 1487 conn_t *connp = icmp->icmp_connp; 1488 1489 *tap = icmp_g_t_info_ack; 1490 1491 if (connp->conn_family 
== AF_INET6) 1492 tap->ADDR_size = sizeof (sin6_t); 1493 else 1494 tap->ADDR_size = sizeof (sin_t); 1495 tap->CURRENT_state = icmp->icmp_state; 1496 tap->OPT_size = icmp_max_optsize; 1497 } 1498 1499 static void 1500 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1501 t_uscalar_t cap_bits1) 1502 { 1503 tcap->CAP_bits1 = 0; 1504 1505 if (cap_bits1 & TC1_INFO) { 1506 icmp_copy_info(&tcap->INFO_ack, icmp); 1507 tcap->CAP_bits1 |= TC1_INFO; 1508 } 1509 } 1510 1511 /* 1512 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1513 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1514 * icmp_g_t_info_ack. The current state of the stream is copied from 1515 * icmp_state. 1516 */ 1517 static void 1518 icmp_capability_req(queue_t *q, mblk_t *mp) 1519 { 1520 icmp_t *icmp = Q_TO_ICMP(q); 1521 t_uscalar_t cap_bits1; 1522 struct T_capability_ack *tcap; 1523 1524 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1525 1526 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1527 mp->b_datap->db_type, T_CAPABILITY_ACK); 1528 if (!mp) 1529 return; 1530 1531 tcap = (struct T_capability_ack *)mp->b_rptr; 1532 1533 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1534 1535 qreply(q, mp); 1536 } 1537 1538 /* 1539 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1540 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1541 * The current state of the stream is copied from icmp_state. 1542 */ 1543 static void 1544 icmp_info_req(queue_t *q, mblk_t *mp) 1545 { 1546 icmp_t *icmp = Q_TO_ICMP(q); 1547 1548 /* Create a T_INFO_ACK message. 
*/ 1549 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1550 T_INFO_ACK); 1551 if (!mp) 1552 return; 1553 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1554 qreply(q, mp); 1555 } 1556 1557 static int 1558 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1559 int family) 1560 { 1561 conn_t *connp; 1562 dev_t conn_dev; 1563 int error; 1564 1565 /* If the stream is already open, return immediately. */ 1566 if (q->q_ptr != NULL) 1567 return (0); 1568 1569 if (sflag == MODOPEN) 1570 return (EINVAL); 1571 1572 /* 1573 * Since ICMP is not used so heavily, allocating from the small 1574 * arena should be sufficient. 1575 */ 1576 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1577 return (EBUSY); 1578 } 1579 1580 if (flag & SO_FALLBACK) { 1581 /* 1582 * Non streams socket needs a stream to fallback to 1583 */ 1584 RD(q)->q_ptr = (void *)conn_dev; 1585 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1586 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1587 qprocson(q); 1588 return (0); 1589 } 1590 1591 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1592 if (connp == NULL) { 1593 ASSERT(error != 0); 1594 inet_minor_free(ip_minor_arena_sa, connp->conn_dev); 1595 return (error); 1596 } 1597 1598 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1599 connp->conn_dev = conn_dev; 1600 connp->conn_minor_arena = ip_minor_arena_sa; 1601 1602 /* 1603 * Initialize the icmp_t structure for this stream. 1604 */ 1605 q->q_ptr = connp; 1606 WR(q)->q_ptr = connp; 1607 connp->conn_rq = q; 1608 connp->conn_wq = WR(q); 1609 1610 WR(q)->q_hiwat = connp->conn_sndbuf; 1611 WR(q)->q_lowat = connp->conn_sndlowat; 1612 1613 qprocson(q); 1614 1615 /* Set the Stream head write offset. 
*/ 1616 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1617 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1618 1619 mutex_enter(&connp->conn_lock); 1620 connp->conn_state_flags &= ~CONN_INCIPIENT; 1621 mutex_exit(&connp->conn_lock); 1622 1623 icmp_bind_proto(connp->conn_icmp); 1624 1625 return (0); 1626 } 1627 1628 /* For /dev/icmp aka AF_INET open */ 1629 static int 1630 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1631 { 1632 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1633 } 1634 1635 /* For /dev/icmp6 aka AF_INET6 open */ 1636 static int 1637 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1638 { 1639 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1640 } 1641 1642 /* 1643 * This is the open routine for icmp. It allocates a icmp_t structure for 1644 * the stream and, on the first open of the module, creates an ND table. 1645 */ 1646 static conn_t * 1647 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1648 { 1649 icmp_t *icmp; 1650 conn_t *connp; 1651 zoneid_t zoneid; 1652 netstack_t *ns; 1653 icmp_stack_t *is; 1654 int len; 1655 boolean_t isv6 = B_FALSE; 1656 1657 *err = secpolicy_net_icmpaccess(credp); 1658 if (*err != 0) 1659 return (NULL); 1660 1661 if (family == AF_INET6) 1662 isv6 = B_TRUE; 1663 1664 ns = netstack_find_by_cred(credp); 1665 ASSERT(ns != NULL); 1666 is = ns->netstack_icmp; 1667 ASSERT(is != NULL); 1668 1669 /* 1670 * For exclusive stacks we set the zoneid to zero 1671 * to make ICMP operate as if in the global zone. 1672 */ 1673 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1674 zoneid = GLOBAL_ZONEID; 1675 else 1676 zoneid = crgetzoneid(credp); 1677 1678 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1679 1680 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1681 icmp = connp->conn_icmp; 1682 1683 /* 1684 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1685 * done by netstack_find_by_cred() 1686 */ 1687 netstack_rele(ns); 1688 1689 /* 1690 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1691 * need to lock anything. 1692 */ 1693 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1694 ASSERT(connp->conn_icmp == icmp); 1695 ASSERT(icmp->icmp_connp == connp); 1696 1697 /* Set the initial state of the stream and the privilege status. */ 1698 icmp->icmp_state = TS_UNBND; 1699 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1700 if (isv6) { 1701 connp->conn_family = AF_INET6; 1702 connp->conn_ipversion = IPV6_VERSION; 1703 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1704 connp->conn_proto = IPPROTO_ICMPV6; 1705 /* May be changed by a SO_PROTOTYPE socket option. */ 1706 connp->conn_proto = IPPROTO_ICMPV6; 1707 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1708 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1709 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1710 len = sizeof (ip6_t); 1711 } else { 1712 connp->conn_family = AF_INET; 1713 connp->conn_ipversion = IPV4_VERSION; 1714 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1715 /* May be changed by a SO_PROTOTYPE socket option. */ 1716 connp->conn_proto = IPPROTO_ICMP; 1717 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1718 connp->conn_default_ttl = is->is_ipv4_ttl; 1719 len = sizeof (ipha_t); 1720 } 1721 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1722 1723 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1724 1725 /* 1726 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1727 * the checksum is provided in the pre-built packet. We clear 1728 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1729 * complete IP header and not to compute the transport checksum. 
1730 */ 1731 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1732 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1733 connp->conn_ixa->ixa_zoneid = zoneid; 1734 1735 connp->conn_zoneid = zoneid; 1736 1737 /* 1738 * If the caller has the process-wide flag set, then default to MAC 1739 * exempt mode. This allows read-down to unlabeled hosts. 1740 */ 1741 if (getpflags(NET_MAC_AWARE, credp) != 0) 1742 connp->conn_mac_mode = CONN_MAC_AWARE; 1743 1744 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1745 1746 icmp->icmp_is = is; 1747 1748 connp->conn_rcvbuf = is->is_recv_hiwat; 1749 connp->conn_sndbuf = is->is_xmit_hiwat; 1750 connp->conn_sndlowat = is->is_xmit_lowat; 1751 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1752 1753 connp->conn_wroff = len + is->is_wroff_extra; 1754 connp->conn_so_type = SOCK_RAW; 1755 1756 connp->conn_recv = icmp_input; 1757 connp->conn_recvicmp = icmp_icmp_input; 1758 crhold(credp); 1759 connp->conn_cred = credp; 1760 connp->conn_cpid = curproc->p_pid; 1761 connp->conn_open_time = ddi_get_lbolt64(); 1762 /* Cache things in ixa without an extra refhold */ 1763 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1764 connp->conn_ixa->ixa_cred = connp->conn_cred; 1765 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1766 if (is_system_labeled()) 1767 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1768 1769 connp->conn_flow_cntrld = B_FALSE; 1770 1771 if (is->is_pmtu_discovery) 1772 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1773 1774 return (connp); 1775 } 1776 1777 /* 1778 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1779 */ 1780 /* ARGSUSED */ 1781 static boolean_t 1782 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1783 { 1784 return (B_TRUE); 1785 } 1786 1787 /* 1788 * This routine gets default values of certain options whose default 1789 * values are maintained by protcol specific code 1790 */ 1791 int 1792 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1793 { 1794 icmp_t *icmp = Q_TO_ICMP(q); 1795 icmp_stack_t *is = icmp->icmp_is; 1796 int *i1 = (int *)ptr; 1797 1798 switch (level) { 1799 case IPPROTO_IP: 1800 switch (name) { 1801 case IP_MULTICAST_TTL: 1802 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1803 return (sizeof (uchar_t)); 1804 case IP_MULTICAST_LOOP: 1805 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1806 return (sizeof (uchar_t)); 1807 } 1808 break; 1809 case IPPROTO_IPV6: 1810 switch (name) { 1811 case IPV6_MULTICAST_HOPS: 1812 *i1 = IP_DEFAULT_MULTICAST_TTL; 1813 return (sizeof (int)); 1814 case IPV6_MULTICAST_LOOP: 1815 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1816 return (sizeof (int)); 1817 case IPV6_UNICAST_HOPS: 1818 *i1 = is->is_ipv6_hoplimit; 1819 return (sizeof (int)); 1820 } 1821 break; 1822 case IPPROTO_ICMPV6: 1823 switch (name) { 1824 case ICMP6_FILTER: 1825 /* Make it look like "pass all" */ 1826 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1827 return (sizeof (icmp6_filter_t)); 1828 } 1829 break; 1830 } 1831 return (-1); 1832 } 1833 1834 /* 1835 * This routine retrieves the current status of socket options. 1836 * It returns the size of the option retrieved, or -1. 
 */
int
icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		*i1 = (int *)ptr;
	conn_opt_arg_t	coas;
	int		retval;

	/* Argument block for the common conn_opt_get() fallback below. */
	coas.coa_connp = connp;
	coas.coa_ixa = connp->conn_ixa;
	coas.coa_ipp = &connp->conn_xmit_ipp;
	coas.coa_ancillary = B_FALSE;
	coas.coa_changed = 0;

	/*
	 * We assume that the optcom framework has checked for the set
	 * of levels and names that are supported, hence we don't worry
	 * about rejecting based on that.
	 * First check for ICMP specific handling, then pass to common routine.
	 */
	switch (level) {
	case IPPROTO_IP:
		/*
		 * Only allow IPv4 option processing on IPv4 sockets.
		 */
		if (connp->conn_family != AF_INET)
			return (-1);

		switch (name) {
		case IP_OPTIONS:
		case T_IP_OPTIONS:
			/* Options are passed up with each packet */
			return (0);
		case IP_HDRINCL:
			mutex_enter(&connp->conn_lock);
			*i1 = (int)icmp->icmp_hdrincl;
			mutex_exit(&connp->conn_lock);
			return (sizeof (int));
		}
		break;

	case IPPROTO_IPV6:
		/*
		 * Only allow IPv6 option processing on native IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (-1);

		switch (name) {
		case IPV6_CHECKSUM:
			/*
			 * Return offset or -1 if no checksum offset.
			 * Does not apply to IPPROTO_ICMPV6
			 */
			if (connp->conn_proto == IPPROTO_ICMPV6)
				return (-1);

			mutex_enter(&connp->conn_lock);
			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
			else
				*i1 = -1;
			mutex_exit(&connp->conn_lock);
			return (sizeof (int));
		}
		break;

	case IPPROTO_ICMPV6:
		/*
		 * Only allow IPv6 option processing on native IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (-1);

		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (-1);

		switch (name) {
		case ICMP6_FILTER:
			mutex_enter(&connp->conn_lock);
			if (icmp->icmp_filter == NULL) {
				/* Make it look like "pass all" */
				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
			} else {
				(void) bcopy(icmp->icmp_filter, ptr,
				    sizeof (icmp6_filter_t));
			}
			mutex_exit(&connp->conn_lock);
			return (sizeof (icmp6_filter_t));
		}
	}
	/* Not handled above: let the common code look the option up. */
	mutex_enter(&connp->conn_lock);
	retval = conn_opt_get(&coas, level, name, ptr);
	mutex_exit(&connp->conn_lock);
	return (retval);
}

/*
 * This routine retrieves the current status of socket options.
 * It returns the size of the option retrieved, or -1.
 * Queue-based wrapper around icmp_opt_get().
 */
int
icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
	conn_t		*connp = Q_TO_CONN(q);
	int		err;

	err = icmp_opt_get(connp, level, name, ptr);
	return (err);
}

/*
 * This routine sets socket options.
 *
 * ICMP-specific options are validated and (unless checkonly is set)
 * applied here; every path that does not return early then falls through
 * to conn_opt_set() for the common handling.  Returns 0 or an errno.
 * Must be called without conn_lock held.
 */
int
icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
    uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
{
	conn_t		*connp = coa->coa_connp;
	ip_xmit_attr_t	*ixa = coa->coa_ixa;
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		*i1 = (int *)invalp;
	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
	int		error;

	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));

	/*
	 * For fixed length options, no sanity check
	 * of passed in length is done. It is assumed *_optcom_req()
	 * routines do the right thing.
	 */

	switch (level) {
	case SOL_SOCKET:
		switch (name) {
		case SO_PROTOTYPE:
			/*
			 * Any protocol other than ICMP/ICMPv6 requires the
			 * raw access privilege.
			 */
			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
			    secpolicy_net_rawaccess(cr) != 0) {
				return (EACCES);
			}
			if (checkonly)
				break;

			mutex_enter(&connp->conn_lock);
			connp->conn_proto = *i1 & 0xFF;
			ixa->ixa_protocol = connp->conn_proto;
			if ((connp->conn_proto == IPPROTO_RAW ||
			    connp->conn_proto == IPPROTO_IGMP) &&
			    connp->conn_family == AF_INET) {
				icmp->icmp_hdrincl = 1;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else if (connp->conn_proto == IPPROTO_UDP ||
			    connp->conn_proto == IPPROTO_TCP ||
			    connp->conn_proto == IPPROTO_SCTP) {
				/* Used by test applications like psh */
				icmp->icmp_hdrincl = 0;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else {
				icmp->icmp_hdrincl = 0;
				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
			}

			if (connp->conn_family == AF_INET6 &&
			    connp->conn_proto == IPPROTO_ICMPV6) {
				/* Set offset for icmp6_cksum */
				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = 2;
			}
			/* A filter is only kept while the proto is ICMPv6. */
			if (icmp->icmp_filter != NULL &&
			    connp->conn_proto != IPPROTO_ICMPV6) {
				kmem_free(icmp->icmp_filter,
				    sizeof (icmp6_filter_t));
				icmp->icmp_filter = NULL;
			}
			mutex_exit(&connp->conn_lock);

			coa->coa_changed |= COA_HEADER_CHANGED;
			/*
			 * For SCTP, we don't use icmp_bind_proto() for
			 * raw socket binding.
			 */
			if (connp->conn_proto == IPPROTO_SCTP)
				return (0);

			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
			return (0);

		case SO_SNDBUF:
			if (*i1 > is->is_max_buf) {
				return (ENOBUFS);
			}
			break;
		case SO_RCVBUF:
			if (*i1 > is->is_max_buf) {
				return (ENOBUFS);
			}
			break;
		}
		break;

	case IPPROTO_IP:
		/*
		 * Only allow IPv4 option processing on IPv4 sockets.
		 */
		if (connp->conn_family != AF_INET)
			return (EINVAL);

		switch (name) {
		case IP_HDRINCL:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				icmp->icmp_hdrincl = onoff;
				if (onoff)
					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
				else
					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
				mutex_exit(&connp->conn_lock);
			}
			break;
		}
		break;

	case IPPROTO_IPV6:
		if (connp->conn_family != AF_INET6)
			return (EINVAL);

		switch (name) {
		case IPV6_CHECKSUM:
			/*
			 * Integer offset into the user data of where the
			 * checksum is located.
			 * Offset of -1 disables option.
			 * Does not apply to IPPROTO_ICMPV6.
			 */
			if (connp->conn_proto == IPPROTO_ICMPV6 ||
			    coa->coa_ancillary) {
				return (EINVAL);
			}
			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
				/* Negative or not 16 bit aligned offset */
				return (EINVAL);
			}
			if (checkonly)
				break;

			mutex_enter(&connp->conn_lock);
			if (*i1 == -1) {
				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = 0;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else {
				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = *i1;
				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;

	case IPPROTO_ICMPV6:
		/*
		 * Only allow IPv6 option processing on IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (EINVAL);
		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (EINVAL);

		switch (name) {
		case ICMP6_FILTER:
			/* NOTE(review): inlen is not validated when checkonly */
			if (checkonly)
				break;

			if ((inlen != 0) &&
			    (inlen != sizeof (icmp6_filter_t)))
				return (EINVAL);

			mutex_enter(&connp->conn_lock);
			if (inlen == 0) {
				/* A zero length value removes the filter. */
				if (icmp->icmp_filter != NULL) {
					kmem_free(icmp->icmp_filter,
					    sizeof (icmp6_filter_t));
					icmp->icmp_filter = NULL;
				}
			} else {
				/* Allocate on first use; KM_NOSLEEP under lock */
				if (icmp->icmp_filter == NULL) {
					icmp->icmp_filter = kmem_alloc(
					    sizeof (icmp6_filter_t),
					    KM_NOSLEEP);
					if (icmp->icmp_filter == NULL) {
						mutex_exit(&connp->conn_lock);
						return (ENOBUFS);
					}
				}
				(void) bcopy(invalp, icmp->icmp_filter, inlen);
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;
	}
	/* Common option handling, also reached after the cases above. */
	error = conn_opt_set(coa, level, name, inlen, invalp,
	    checkonly, cr);
	return (error);
}

/*
 * This routine sets socket options.
 *
 * optset_context selects check-only versus negotiate semantics.  When
 * thisdg_attrs is non-NULL it is the conn_opt_arg_t for options carried
 * with a T_UNITDATA_REQ (ancillary); otherwise a local one is built around
 * a reference on conn_ixa.  On success the value is copied to outvalp and
 * *outlenp is set to inlen; on failure *outlenp is 0 and an errno is
 * returned.  For non-ancillary options the side effects recorded in
 * coa_changed (route, header template, buffer sizes, write offset,
 * protocol bind) are then applied.
 */
int
icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		err;
	conn_opt_arg_t	coas, *coa;
	boolean_t	checkonly;
	icmp_stack_t	*is = icmp->icmp_is;

	switch (optset_context) {
	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 * 	we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 * 	value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
		 */
		if (inlen == 0) {
			*outlenp = 0;
			return (0);
		}
		break;
	case SETFN_OPTCOM_NEGOTIATE:
		checkonly = B_FALSE;
		break;
	case SETFN_UD_NEGOTIATE:
	case SETFN_CONN_NEGOTIATE:
		checkonly = B_FALSE;
		/*
		 * Negotiating local and "association-related" options
		 * through T_UNITDATA_REQ.
		 *
		 * Following routine can filter out ones we do not
		 * want to be "set" this way.
		 */
		if (!icmp_opt_allow_udr_set(level, name)) {
			*outlenp = 0;
			return (EINVAL);
		}
		break;
	default:
		/*
		 * We should never get here
		 */
		*outlenp = 0;
		return (EINVAL);
	}

	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));

	if (thisdg_attrs != NULL) {
		/* Options from T_UNITDATA_REQ */
		coa = (conn_opt_arg_t *)thisdg_attrs;
		ASSERT(coa->coa_connp == connp);
		ASSERT(coa->coa_ixa != NULL);
		ASSERT(coa->coa_ipp != NULL);
		ASSERT(coa->coa_ancillary);
	} else {
		coa = &coas;
		coas.coa_connp = connp;
		/* Get a reference on conn_ixa to prevent concurrent mods */
		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
		if (coas.coa_ixa == NULL) {
			*outlenp = 0;
			return (ENOMEM);
		}
		coas.coa_ipp = &connp->conn_xmit_ipp;
		coas.coa_ancillary = B_FALSE;
		coas.coa_changed = 0;
	}

	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
	    cr, checkonly);
	if (err != 0) {
errout:
		/* Only the locally obtained ixa reference is released. */
		if (!coa->coa_ancillary)
			ixa_refrele(coa->coa_ixa);
		*outlenp = 0;
		return (err);
	}

	/*
	 * Common case of OK return with outval same as inval.
	 */
	if (invalp != outvalp) {
		/* don't trust bcopy for identical src/dst */
		(void) bcopy(invalp, outvalp, inlen);
	}
	*outlenp = inlen;

	/*
	 * If this was not ancillary data, then we rebuild the headers,
	 * update the IRE/NCE, and IPsec as needed.
	 * Since the label depends on the destination we go through
	 * ip_set_destination first.
	 */
	if (coa->coa_ancillary) {
		return (0);
	}

	if (coa->coa_changed & COA_ROUTE_CHANGED) {
		in6_addr_t saddr, faddr, nexthop;
		in_port_t fport;

		/*
		 * We clear lastdst to make sure we pick up the change
		 * next time sending.
		 * If we are connected we re-cache the information.
		 * We ignore errors to preserve BSD behavior.
		 * Note that we don't redo IPsec policy lookup here
		 * since the final destination (or source) didn't change.
		 */
		mutex_enter(&connp->conn_lock);
		connp->conn_v6lastdst = ipv6_all_zeros;

		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
		    &connp->conn_faddr_v6, &nexthop);
		/* Snapshot the addresses so the lock can be dropped. */
		saddr = connp->conn_saddr_v6;
		faddr = connp->conn_faddr_v6;
		fport = connp->conn_fport;
		mutex_exit(&connp->conn_lock);

		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
			(void) ip_attr_connect(connp, coa->coa_ixa,
			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
		}
	}

	/* Done with the ixa; the remaining steps only touch the conn. */
	ixa_refrele(coa->coa_ixa);

	if (coa->coa_changed & COA_HEADER_CHANGED) {
		/*
		 * Rebuild the header template if we are connected.
		 * Otherwise clear conn_v6lastdst so we rebuild the header
		 * in the data path.
		 */
		mutex_enter(&connp->conn_lock);
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			err = icmp_build_hdr_template(connp,
			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
			    connp->conn_flowinfo);
			if (err != 0) {
				mutex_exit(&connp->conn_lock);
				return (err);
			}
		} else {
			connp->conn_v6lastdst = ipv6_all_zeros;
		}
		mutex_exit(&connp->conn_lock);
	}
	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
		    connp->conn_rcvbuf);
	}
	/* The write queue is only touched for STREAMS endpoints. */
	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
	}
	if (coa->coa_changed & COA_WROFF_CHANGED) {
		/* Increase wroff if needed */
		uint_t wroff;

		mutex_enter(&connp->conn_lock);
		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
		if (wroff > connp->conn_wroff) {
			connp->conn_wroff = wroff;
			mutex_exit(&connp->conn_lock);
			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
		} else {
			mutex_exit(&connp->conn_lock);
		}
	}
	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
		icmp_bind_proto(icmp);
	}
	return (err);
}

/*
 * This routine sets socket options.
 * Queue-based wrapper around icmp_opt_set().
 */
int
icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	conn_t	*connp = Q_TO_CONN(q);
	int error;

	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
	    outlenp, outvalp, thisdg_attrs, cr);
	return (error);
}

/*
 * Setup IP headers.
 *
 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
 * but icmp_output_hdrincl restores ipha_protocol once we return.
2356 */ 2357 mblk_t * 2358 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2359 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2360 mblk_t *data_mp, int *errorp) 2361 { 2362 mblk_t *mp; 2363 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2364 uint_t data_len; 2365 uint32_t cksum; 2366 2367 data_len = msgdsize(data_mp); 2368 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2369 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2370 if (mp == NULL) { 2371 ASSERT(*errorp != 0); 2372 return (NULL); 2373 } 2374 2375 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2376 2377 /* 2378 * If there was a routing option/header then conn_prepend_hdr 2379 * has massaged it and placed the pseudo-header checksum difference 2380 * in the cksum argument. 2381 * 2382 * Prepare for ICMPv6 checksum done in IP. 2383 * 2384 * We make it easy for IP to include our pseudo header 2385 * by putting our length (and any routing header adjustment) 2386 * in the ICMPv6 checksum field. 2387 * The IP source, destination, and length have already been set by 2388 * conn_prepend_hdr. 
2389 */ 2390 cksum += data_len; 2391 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2392 ASSERT(cksum < 0x10000); 2393 2394 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2395 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2396 2397 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2398 } else { 2399 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2400 uint_t cksum_offset = 0; 2401 2402 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2403 2404 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2405 if (connp->conn_proto == IPPROTO_ICMPV6) { 2406 cksum_offset = ixa->ixa_ip_hdr_length + 2407 offsetof(icmp6_t, icmp6_cksum); 2408 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2409 cksum_offset = ixa->ixa_ip_hdr_length + 2410 ixa->ixa_raw_cksum_offset; 2411 } 2412 } 2413 if (cksum_offset != 0) { 2414 uint16_t *ptr; 2415 2416 /* Make sure the checksum fits in the first mblk */ 2417 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2418 mblk_t *mp1; 2419 2420 mp1 = msgpullup(mp, 2421 cksum_offset + sizeof (short)); 2422 freemsg(mp); 2423 if (mp1 == NULL) { 2424 *errorp = ENOMEM; 2425 return (NULL); 2426 } 2427 mp = mp1; 2428 ip6h = (ip6_t *)mp->b_rptr; 2429 } 2430 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2431 *ptr = htons(cksum); 2432 } 2433 } 2434 2435 /* Note that we don't try to update wroff due to ancillary data */ 2436 return (mp); 2437 } 2438 2439 static int 2440 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2441 const in6_addr_t *v6dst, uint32_t flowinfo) 2442 { 2443 int error; 2444 2445 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2446 /* 2447 * We clear lastdst to make sure we don't use the lastdst path 2448 * next time sending since we might not have set v6dst yet. 2449 */ 2450 connp->conn_v6lastdst = ipv6_all_zeros; 2451 2452 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2453 if (error != 0) 2454 return (error); 2455 2456 /* 2457 * Any routing header/option has been massaged. The checksum difference 2458 * is stored in conn_sum. 
2459 */ 2460 return (0); 2461 } 2462 2463 static mblk_t * 2464 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2465 { 2466 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2467 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2468 /* 2469 * fallback has started but messages have not been moved yet 2470 */ 2471 if (icmp->icmp_fallback_queue_head == NULL) { 2472 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2473 icmp->icmp_fallback_queue_head = mp; 2474 icmp->icmp_fallback_queue_tail = mp; 2475 } else { 2476 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2477 icmp->icmp_fallback_queue_tail->b_next = mp; 2478 icmp->icmp_fallback_queue_tail = mp; 2479 } 2480 return (NULL); 2481 } else { 2482 /* 2483 * Fallback completed, let the caller putnext() the mblk. 2484 */ 2485 return (mp); 2486 } 2487 } 2488 2489 /* 2490 * Deliver data to ULP. In case we have a socket, and it's falling back to 2491 * TPI, then we'll queue the mp for later processing. 2492 */ 2493 static void 2494 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2495 { 2496 if (IPCL_IS_NONSTR(connp)) { 2497 icmp_t *icmp = connp->conn_icmp; 2498 int error; 2499 2500 ASSERT(len == msgdsize(mp)); 2501 if ((*connp->conn_upcalls->su_recv) 2502 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2503 mutex_enter(&icmp->icmp_recv_lock); 2504 if (error == ENOSPC) { 2505 /* 2506 * let's confirm while holding the lock 2507 */ 2508 if ((*connp->conn_upcalls->su_recv) 2509 (connp->conn_upper_handle, NULL, 0, 0, 2510 &error, NULL) < 0) { 2511 ASSERT(error == ENOSPC); 2512 if (error == ENOSPC) { 2513 connp->conn_flow_cntrld = 2514 B_TRUE; 2515 } 2516 } 2517 mutex_exit(&icmp->icmp_recv_lock); 2518 } else { 2519 ASSERT(error == EOPNOTSUPP); 2520 mp = icmp_queue_fallback(icmp, mp); 2521 mutex_exit(&icmp->icmp_recv_lock); 2522 if (mp != NULL) 2523 putnext(connp->conn_rq, mp); 2524 } 2525 } 2526 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2527 } else { 2528 putnext(connp->conn_rq, mp); 2529 } 2530 } 2531 2532 /* 2533 * This is 
the inbound data path. 2534 * IP has already pulled up the IP headers and verified alignment 2535 * etc. 2536 */ 2537 /* ARGSUSED2 */ 2538 static void 2539 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2540 { 2541 conn_t *connp = (conn_t *)arg1; 2542 struct T_unitdata_ind *tudi; 2543 uchar_t *rptr; /* Pointer to IP header */ 2544 int ip_hdr_length; 2545 int udi_size; /* Size of T_unitdata_ind */ 2546 int pkt_len; 2547 icmp_t *icmp; 2548 ip_pkt_t ipps; 2549 ip6_t *ip6h; 2550 mblk_t *mp1; 2551 crb_t recv_ancillary; 2552 icmp_stack_t *is; 2553 sin_t *sin; 2554 sin6_t *sin6; 2555 ipha_t *ipha; 2556 2557 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2558 2559 icmp = connp->conn_icmp; 2560 is = icmp->icmp_is; 2561 rptr = mp->b_rptr; 2562 2563 ASSERT(DB_TYPE(mp) == M_DATA); 2564 ASSERT(OK_32PTR(rptr)); 2565 ASSERT(ira->ira_pktlen == msgdsize(mp)); 2566 pkt_len = ira->ira_pktlen; 2567 2568 /* 2569 * Get a snapshot of these and allow other threads to change 2570 * them after that. We need the same recv_ancillary when determining 2571 * the size as when adding the ancillary data items. 2572 */ 2573 mutex_enter(&connp->conn_lock); 2574 recv_ancillary = connp->conn_recv_ancillary; 2575 mutex_exit(&connp->conn_lock); 2576 2577 ip_hdr_length = ira->ira_ip_hdr_length; 2578 ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ 2579 2580 /* Initialize regardless of IP version */ 2581 ipps.ipp_fields = 0; 2582 2583 if (ira->ira_flags & IRAF_IS_IPV4) { 2584 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); 2585 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 2586 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); 2587 2588 ipha = (ipha_t *)mp->b_rptr; 2589 if (recv_ancillary.crb_all != 0) 2590 (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); 2591 2592 /* 2593 * BSD for some reason adjusts ipha_length to exclude the 2594 * IP header length. We do the same. 
2595 */ 2596 if (is->is_bsd_compat) { 2597 ushort_t len; 2598 2599 len = ntohs(ipha->ipha_length); 2600 if (mp->b_datap->db_ref > 1) { 2601 /* 2602 * Allocate a new IP header so that we can 2603 * modify ipha_length. 2604 */ 2605 mblk_t *mp1; 2606 2607 mp1 = allocb(ip_hdr_length, BPRI_MED); 2608 if (mp1 == NULL) { 2609 freemsg(mp); 2610 BUMP_MIB(&is->is_rawip_mib, 2611 rawipInErrors); 2612 return; 2613 } 2614 bcopy(rptr, mp1->b_rptr, ip_hdr_length); 2615 mp->b_rptr = rptr + ip_hdr_length; 2616 rptr = mp1->b_rptr; 2617 ipha = (ipha_t *)rptr; 2618 mp1->b_cont = mp; 2619 mp1->b_wptr = rptr + ip_hdr_length; 2620 mp = mp1; 2621 } 2622 len -= ip_hdr_length; 2623 ipha->ipha_length = htons(len); 2624 } 2625 2626 /* 2627 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 2628 * sockets. This is ensured by icmp_bind and the IP fanout code. 2629 */ 2630 ASSERT(connp->conn_family == AF_INET); 2631 2632 /* 2633 * This is the inbound data path. Packets are passed upstream 2634 * as T_UNITDATA_IND messages with full IPv4 headers still 2635 * attached. 2636 */ 2637 2638 /* 2639 * Normally only send up the source address. 2640 * If any ancillary data items are wanted we add those. 2641 */ 2642 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 2643 if (recv_ancillary.crb_all != 0) { 2644 udi_size += conn_recvancillary_size(connp, 2645 recv_ancillary, ira, mp, &ipps); 2646 } 2647 2648 /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ 2649 mp1 = allocb(udi_size, BPRI_MED); 2650 if (mp1 == NULL) { 2651 freemsg(mp); 2652 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2653 return; 2654 } 2655 mp1->b_cont = mp; 2656 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2657 mp1->b_datap->db_type = M_PROTO; 2658 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2659 tudi->PRIM_type = T_UNITDATA_IND; 2660 tudi->SRC_length = sizeof (sin_t); 2661 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2662 sin = (sin_t *)&tudi[1]; 2663 *sin = sin_null; 2664 sin->sin_family = AF_INET; 2665 sin->sin_addr.s_addr = ipha->ipha_src; 2666 *(uint32_t *)&sin->sin_zero[0] = 0; 2667 *(uint32_t *)&sin->sin_zero[4] = 0; 2668 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + 2669 sizeof (sin_t); 2670 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 2671 tudi->OPT_length = udi_size; 2672 2673 /* 2674 * Add options if IP_RECVIF etc is set 2675 */ 2676 if (udi_size != 0) { 2677 conn_recvancillary_add(connp, recv_ancillary, ira, 2678 &ipps, (uchar_t *)&sin[1], udi_size); 2679 } 2680 goto deliver; 2681 } 2682 2683 ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); 2684 /* 2685 * IPv6 packets can only be received by applications 2686 * that are prepared to receive IPv6 addresses. 2687 * The IP fanout must ensure this. 2688 */ 2689 ASSERT(connp->conn_family == AF_INET6); 2690 2691 /* 2692 * Handle IPv6 packets. We don't pass up the IP headers with the 2693 * payload for IPv6. 2694 */ 2695 2696 ip6h = (ip6_t *)rptr; 2697 if (recv_ancillary.crb_all != 0) { 2698 /* 2699 * Call on ip_find_hdr_v6 which gets individual lenghts of 2700 * extension headers (and pointers to them). 2701 */ 2702 uint8_t nexthdr; 2703 2704 /* We don't care about the length or nextheader. */ 2705 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); 2706 2707 /* 2708 * We do not pass up hop-by-hop options or any other 2709 * extension header as part of the packet. Applications 2710 * that want to see them have to specify IPV6_RECV* socket 2711 * options. 
And conn_recvancillary_size/add explicitly 2712 * drops the TX option from IPV6_HOPOPTS as it does for UDP. 2713 * 2714 * If we had multilevel ICMP sockets, then we'd want to 2715 * modify conn_recvancillary_size/add to 2716 * allow the user to see the label. 2717 */ 2718 } 2719 2720 /* 2721 * Check a filter for ICMPv6 types if needed. 2722 * Verify raw checksums if needed. 2723 */ 2724 mutex_enter(&connp->conn_lock); 2725 if (icmp->icmp_filter != NULL) { 2726 int type; 2727 2728 /* Assumes that IP has done the pullupmsg */ 2729 type = mp->b_rptr[ip_hdr_length]; 2730 2731 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); 2732 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { 2733 mutex_exit(&connp->conn_lock); 2734 freemsg(mp); 2735 return; 2736 } 2737 } 2738 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2739 /* Checksum */ 2740 uint16_t *up; 2741 uint32_t sum; 2742 int remlen; 2743 2744 up = (uint16_t *)&ip6h->ip6_src; 2745 2746 remlen = msgdsize(mp) - ip_hdr_length; 2747 sum = htons(connp->conn_proto + remlen) 2748 + up[0] + up[1] + up[2] + up[3] 2749 + up[4] + up[5] + up[6] + up[7] 2750 + up[8] + up[9] + up[10] + up[11] 2751 + up[12] + up[13] + up[14] + up[15]; 2752 sum = (sum & 0xffff) + (sum >> 16); 2753 sum = IP_CSUM(mp, ip_hdr_length, sum); 2754 if (sum != 0) { 2755 /* IPv6 RAW checksum failed */ 2756 ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); 2757 mutex_exit(&connp->conn_lock); 2758 freemsg(mp); 2759 BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); 2760 return; 2761 } 2762 } 2763 mutex_exit(&connp->conn_lock); 2764 2765 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2766 2767 if (recv_ancillary.crb_all != 0) { 2768 udi_size += conn_recvancillary_size(connp, 2769 recv_ancillary, ira, mp, &ipps); 2770 } 2771 2772 mp1 = allocb(udi_size, BPRI_MED); 2773 if (mp1 == NULL) { 2774 freemsg(mp); 2775 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2776 return; 2777 } 2778 mp1->b_cont = mp; 2779 mp1->b_datap->db_type = M_PROTO; 2780 
tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2781 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2782 tudi->PRIM_type = T_UNITDATA_IND; 2783 tudi->SRC_length = sizeof (sin6_t); 2784 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2785 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2786 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); 2787 tudi->OPT_length = udi_size; 2788 sin6 = (sin6_t *)&tudi[1]; 2789 *sin6 = sin6_null; 2790 sin6->sin6_port = 0; 2791 sin6->sin6_family = AF_INET6; 2792 2793 sin6->sin6_addr = ip6h->ip6_src; 2794 /* No sin6_flowinfo per API */ 2795 sin6->sin6_flowinfo = 0; 2796 /* For link-scope pass up scope id */ 2797 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) 2798 sin6->sin6_scope_id = ira->ira_ruifindex; 2799 else 2800 sin6->sin6_scope_id = 0; 2801 sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 2802 IPCL_ZONEID(connp), is->is_netstack); 2803 2804 if (udi_size != 0) { 2805 conn_recvancillary_add(connp, recv_ancillary, ira, 2806 &ipps, (uchar_t *)&sin6[1], udi_size); 2807 } 2808 2809 /* Skip all the IPv6 headers per API */ 2810 mp->b_rptr += ip_hdr_length; 2811 pkt_len -= ip_hdr_length; 2812 2813 deliver: 2814 BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); 2815 icmp_ulp_recv(connp, mp1, pkt_len); 2816 } 2817 2818 /* 2819 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report 2820 * information that can be changing beneath us. 
2821 */ 2822 mblk_t * 2823 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2824 { 2825 mblk_t *mpdata; 2826 struct opthdr *optp; 2827 conn_t *connp = Q_TO_CONN(q); 2828 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2829 mblk_t *mp2ctl; 2830 2831 /* 2832 * make a copy of the original message 2833 */ 2834 mp2ctl = copymsg(mpctl); 2835 2836 if (mpctl == NULL || 2837 (mpdata = mpctl->b_cont) == NULL) { 2838 freemsg(mpctl); 2839 freemsg(mp2ctl); 2840 return (0); 2841 } 2842 2843 /* fixed length structure for IPv4 and IPv6 counters */ 2844 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2845 optp->level = EXPER_RAWIP; 2846 optp->name = 0; 2847 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2848 sizeof (is->is_rawip_mib)); 2849 optp->len = msgdsize(mpdata); 2850 qreply(q, mpctl); 2851 2852 return (mp2ctl); 2853 } 2854 2855 /* 2856 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2857 * TODO: If this ever actually tries to set anything, it needs to be 2858 * to do the appropriate locking. 2859 */ 2860 /* ARGSUSED */ 2861 int 2862 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2863 uchar_t *ptr, int len) 2864 { 2865 switch (level) { 2866 case EXPER_RAWIP: 2867 return (0); 2868 default: 2869 return (1); 2870 } 2871 } 2872 2873 /* 2874 * This routine creates a T_UDERROR_IND message and passes it upstream. 2875 * The address and options are copied from the T_UNITDATA_REQ message 2876 * passed in mp. This message is freed. 
2877 */ 2878 static void 2879 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2880 { 2881 struct T_unitdata_req *tudr; 2882 mblk_t *mp1; 2883 uchar_t *destaddr; 2884 t_scalar_t destlen; 2885 uchar_t *optaddr; 2886 t_scalar_t optlen; 2887 2888 if ((mp->b_wptr < mp->b_rptr) || 2889 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2890 goto done; 2891 } 2892 tudr = (struct T_unitdata_req *)mp->b_rptr; 2893 destaddr = mp->b_rptr + tudr->DEST_offset; 2894 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2895 destaddr + tudr->DEST_length < mp->b_rptr || 2896 destaddr + tudr->DEST_length > mp->b_wptr) { 2897 goto done; 2898 } 2899 optaddr = mp->b_rptr + tudr->OPT_offset; 2900 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2901 optaddr + tudr->OPT_length < mp->b_rptr || 2902 optaddr + tudr->OPT_length > mp->b_wptr) { 2903 goto done; 2904 } 2905 destlen = tudr->DEST_length; 2906 optlen = tudr->OPT_length; 2907 2908 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2909 (char *)optaddr, optlen, err); 2910 if (mp1 != NULL) 2911 qreply(q, mp1); 2912 2913 done: 2914 freemsg(mp); 2915 } 2916 2917 static int 2918 rawip_do_unbind(conn_t *connp) 2919 { 2920 icmp_t *icmp = connp->conn_icmp; 2921 2922 mutex_enter(&connp->conn_lock); 2923 /* If a bind has not been done, we can't unbind. 
*/ 2924 if (icmp->icmp_state == TS_UNBND) { 2925 mutex_exit(&connp->conn_lock); 2926 return (-TOUTSTATE); 2927 } 2928 connp->conn_saddr_v6 = ipv6_all_zeros; 2929 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2930 connp->conn_laddr_v6 = ipv6_all_zeros; 2931 connp->conn_mcbc_bind = B_FALSE; 2932 connp->conn_lport = 0; 2933 connp->conn_fport = 0; 2934 /* In case we were also connected */ 2935 connp->conn_faddr_v6 = ipv6_all_zeros; 2936 connp->conn_v6lastdst = ipv6_all_zeros; 2937 2938 icmp->icmp_state = TS_UNBND; 2939 2940 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2941 &connp->conn_faddr_v6, connp->conn_flowinfo); 2942 mutex_exit(&connp->conn_lock); 2943 2944 ip_unbind(connp); 2945 return (0); 2946 } 2947 2948 /* 2949 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2950 * After some error checking, the message is passed downstream to ip. 2951 */ 2952 static void 2953 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2954 { 2955 conn_t *connp = Q_TO_CONN(q); 2956 int error; 2957 2958 ASSERT(mp->b_cont == NULL); 2959 error = rawip_do_unbind(connp); 2960 if (error) { 2961 if (error < 0) { 2962 icmp_err_ack(q, mp, -error, 0); 2963 } else { 2964 icmp_err_ack(q, mp, 0, error); 2965 } 2966 return; 2967 } 2968 2969 /* 2970 * Convert mp into a T_OK_ACK 2971 */ 2972 2973 mp = mi_tpi_ok_ack_alloc(mp); 2974 2975 /* 2976 * should not happen in practice... T_OK_ACK is smaller than the 2977 * original message. 2978 */ 2979 ASSERT(mp != NULL); 2980 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 2981 qreply(q, mp); 2982 } 2983 2984 /* 2985 * Process IPv4 packets that already include an IP header. 2986 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 2987 * IPPROTO_IGMP). 2988 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 2989 * 2990 * The packet is assumed to have a base (20 byte) IP header followed 2991 * by the upper-layer protocol. 
We include any IP_OPTIONS including a 2992 * CIPSO label but otherwise preserve the base IP header. 2993 */ 2994 static int 2995 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 2996 { 2997 icmp_t *icmp = connp->conn_icmp; 2998 icmp_stack_t *is = icmp->icmp_is; 2999 ipha_t iphas; 3000 ipha_t *ipha; 3001 int ip_hdr_length; 3002 int tp_hdr_len; 3003 ip_xmit_attr_t *ixa; 3004 ip_pkt_t *ipp; 3005 in6_addr_t v6src; 3006 in6_addr_t v6dst; 3007 in6_addr_t v6nexthop; 3008 int error; 3009 boolean_t do_ipsec; 3010 3011 /* 3012 * We need an exclusive copy of conn_ixa since the included IP 3013 * header could have any destination. 3014 * That copy has no pointers hence we 3015 * need to set them up once we've parsed the ancillary data. 3016 */ 3017 ixa = conn_get_ixa_exclusive(connp); 3018 if (ixa == NULL) { 3019 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3020 freemsg(mp); 3021 return (ENOMEM); 3022 } 3023 ASSERT(cr != NULL); 3024 /* 3025 * Caller has a reference on cr; from db_credp or because we 3026 * are running in process context. 
3027 */ 3028 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3029 ixa->ixa_cred = cr; 3030 ixa->ixa_cpid = pid; 3031 if (is_system_labeled()) { 3032 /* We need to restart with a label based on the cred */ 3033 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3034 } 3035 3036 /* In case previous destination was multicast or multirt */ 3037 ip_attr_newdst(ixa); 3038 3039 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3040 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3041 if (ipp == NULL) { 3042 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3043 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3044 ixa->ixa_cpid = connp->conn_cpid; 3045 ixa_refrele(ixa); 3046 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3047 freemsg(mp); 3048 return (ENOMEM); 3049 } 3050 mutex_enter(&connp->conn_lock); 3051 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3052 mutex_exit(&connp->conn_lock); 3053 if (error != 0) { 3054 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3055 freemsg(mp); 3056 goto done; 3057 } 3058 3059 /* Sanity check length of packet */ 3060 ipha = (ipha_t *)mp->b_rptr; 3061 3062 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3063 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3064 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3065 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3066 freemsg(mp); 3067 goto done; 3068 } 3069 ipha = (ipha_t *)mp->b_rptr; 3070 } 3071 ipha->ipha_version_and_hdr_length = 3072 (IP_VERSION<<4) | (ip_hdr_length>>2); 3073 3074 /* 3075 * We set IXAF_DONTFRAG if the application set DF which makes 3076 * IP not fragment. 
3077 */ 3078 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3079 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3080 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3081 else 3082 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3083 3084 /* Even for multicast and broadcast we honor the apps ttl */ 3085 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3086 3087 /* 3088 * No source verification for non-local addresses 3089 */ 3090 if (ipha->ipha_src != INADDR_ANY && 3091 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3092 is->is_netstack->netstack_ip, B_FALSE) 3093 != IPVL_UNICAST_UP) { 3094 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3095 } 3096 3097 if (ipha->ipha_dst == INADDR_ANY) 3098 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3099 3100 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3101 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3102 3103 /* Defer IPsec if it might need to look at ICMP type/code */ 3104 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3105 ixa->ixa_flags |= IXAF_IS_IPV4; 3106 3107 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3108 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3109 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3110 (do_ipsec ? IPDF_IPSEC : 0)); 3111 switch (error) { 3112 case 0: 3113 break; 3114 case EADDRNOTAVAIL: 3115 /* 3116 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3117 * Don't have the application see that errno 3118 */ 3119 error = ENETUNREACH; 3120 goto failed; 3121 case ENETDOWN: 3122 /* 3123 * Have !ipif_addr_ready address; drop packet silently 3124 * until we can get applications to not send until we 3125 * are ready. 3126 */ 3127 error = 0; 3128 goto failed; 3129 case EHOSTUNREACH: 3130 case ENETUNREACH: 3131 if (ixa->ixa_ire != NULL) { 3132 /* 3133 * Let conn_ip_output/ire_send_noroute return 3134 * the error and send any local ICMP error. 
3135 */ 3136 error = 0; 3137 break; 3138 } 3139 /* FALLTHRU */ 3140 default: 3141 failed: 3142 freemsg(mp); 3143 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3144 goto done; 3145 } 3146 if (ipha->ipha_src == INADDR_ANY) 3147 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3148 3149 /* 3150 * We might be going to a different destination than last time, 3151 * thus check that TX allows the communication and compute any 3152 * needed label. 3153 * 3154 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3155 * don't have to worry about concurrent threads. 3156 */ 3157 if (is_system_labeled()) { 3158 /* 3159 * Check whether Trusted Solaris policy allows communication 3160 * with this host, and pretend that the destination is 3161 * unreachable if not. 3162 * Compute any needed label and place it in ipp_label_v4/v6. 3163 * 3164 * Later conn_build_hdr_template/conn_prepend_hdr takes 3165 * ipp_label_v4/v6 to form the packet. 3166 * 3167 * Tsol note: We have ipp structure local to this thread so 3168 * no locking is needed. 3169 */ 3170 error = conn_update_label(connp, ixa, &v6dst, ipp); 3171 if (error != 0) { 3172 freemsg(mp); 3173 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3174 goto done; 3175 } 3176 } 3177 3178 /* 3179 * Save away a copy of the IPv4 header the application passed down 3180 * and then prepend an IPv4 header complete with any IP options 3181 * including label. 3182 * We need a struct copy since icmp_prepend_hdr will reuse the available 3183 * space in the mblk. 
3184 */ 3185 iphas = *ipha; 3186 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3187 3188 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3189 if (mp == NULL) { 3190 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3191 ASSERT(error != 0); 3192 goto done; 3193 } 3194 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3195 error = EMSGSIZE; 3196 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3197 freemsg(mp); 3198 goto done; 3199 } 3200 /* Restore key parts of the header that the application passed down */ 3201 ipha = (ipha_t *)mp->b_rptr; 3202 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3203 ipha->ipha_ident = iphas.ipha_ident; 3204 ipha->ipha_fragment_offset_and_flags = 3205 iphas.ipha_fragment_offset_and_flags; 3206 ipha->ipha_ttl = iphas.ipha_ttl; 3207 ipha->ipha_protocol = iphas.ipha_protocol; 3208 ipha->ipha_src = iphas.ipha_src; 3209 ipha->ipha_dst = iphas.ipha_dst; 3210 3211 ixa->ixa_protocol = ipha->ipha_protocol; 3212 3213 /* 3214 * Make sure that the IP header plus any transport header that is 3215 * checksumed by ip_output is in the first mblk. (ip_output assumes 3216 * that at least the checksum field is in the first mblk.) 
3217 */ 3218 switch (ipha->ipha_protocol) { 3219 case IPPROTO_UDP: 3220 tp_hdr_len = 8; 3221 break; 3222 case IPPROTO_TCP: 3223 tp_hdr_len = 20; 3224 break; 3225 default: 3226 tp_hdr_len = 0; 3227 break; 3228 } 3229 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3230 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3231 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3232 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3233 if (mp->b_cont == NULL) 3234 error = EINVAL; 3235 else 3236 error = ENOMEM; 3237 freemsg(mp); 3238 goto done; 3239 } 3240 } 3241 3242 if (!do_ipsec) { 3243 /* Policy might differ for different ICMP type/code */ 3244 if (ixa->ixa_ipsec_policy != NULL) { 3245 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3246 ixa->ixa_ipsec_policy = NULL; 3247 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3248 } 3249 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3250 if (mp == NULL) { 3251 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3252 error = EHOSTUNREACH; /* IPsec policy failure */ 3253 goto done; 3254 } 3255 } 3256 3257 /* We're done. Pass the packet to ip. */ 3258 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3259 3260 error = conn_ip_output(mp, ixa); 3261 /* No rawipOutErrors if an error since IP increases its error counter */ 3262 switch (error) { 3263 case 0: 3264 break; 3265 case EWOULDBLOCK: 3266 (void) ixa_check_drain_insert(connp, ixa); 3267 error = 0; 3268 break; 3269 case EADDRNOTAVAIL: 3270 /* 3271 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3272 * Don't have the application see that errno 3273 */ 3274 error = ENETUNREACH; 3275 break; 3276 } 3277 done: 3278 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3279 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3280 ixa->ixa_cpid = connp->conn_cpid; 3281 ixa_refrele(ixa); 3282 ip_pkt_free(ipp); 3283 kmem_free(ipp, sizeof (*ipp)); 3284 return (error); 3285 } 3286 3287 static mblk_t * 3288 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3289 { 3290 ipha_t *ipha = NULL; 3291 ip6_t *ip6h = NULL; 3292 3293 if (ixa->ixa_flags & IXAF_IS_IPV4) 3294 ipha = (ipha_t *)mp->b_rptr; 3295 else 3296 ip6h = (ip6_t *)mp->b_rptr; 3297 3298 if (ixa->ixa_ipsec_policy != NULL) { 3299 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3300 ixa->ixa_ipsec_policy = NULL; 3301 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3302 } 3303 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3304 } 3305 3306 /* 3307 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3308 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3309 * the TPI options, otherwise we take them from msg_control. 3310 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3311 * Always consumes mp; never consumes tudr_mp. 3312 */ 3313 static int 3314 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3315 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3316 { 3317 icmp_t *icmp = connp->conn_icmp; 3318 icmp_stack_t *is = icmp->icmp_is; 3319 int error; 3320 ip_xmit_attr_t *ixa; 3321 ip_pkt_t *ipp; 3322 in6_addr_t v6src; 3323 in6_addr_t v6dst; 3324 in6_addr_t v6nexthop; 3325 in_port_t dstport; 3326 uint32_t flowinfo; 3327 uint_t srcid; 3328 int is_absreq_failure = 0; 3329 conn_opt_arg_t coas, *coa; 3330 3331 ASSERT(tudr_mp != NULL || msg != NULL); 3332 3333 /* 3334 * Get ixa before checking state to handle a disconnect race. 
3335 * 3336 * We need an exclusive copy of conn_ixa since the ancillary data 3337 * options might modify it. That copy has no pointers hence we 3338 * need to set them up once we've parsed the ancillary data. 3339 */ 3340 ixa = conn_get_ixa_exclusive(connp); 3341 if (ixa == NULL) { 3342 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3343 freemsg(mp); 3344 return (ENOMEM); 3345 } 3346 ASSERT(cr != NULL); 3347 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3348 ixa->ixa_cred = cr; 3349 ixa->ixa_cpid = pid; 3350 if (is_system_labeled()) { 3351 /* We need to restart with a label based on the cred */ 3352 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3353 } 3354 3355 /* In case previous destination was multicast or multirt */ 3356 ip_attr_newdst(ixa); 3357 3358 /* Get a copy of conn_xmit_ipp since the options might change it */ 3359 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3360 if (ipp == NULL) { 3361 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3362 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3363 ixa->ixa_cpid = connp->conn_cpid; 3364 ixa_refrele(ixa); 3365 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3366 freemsg(mp); 3367 return (ENOMEM); 3368 } 3369 mutex_enter(&connp->conn_lock); 3370 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3371 mutex_exit(&connp->conn_lock); 3372 if (error != 0) { 3373 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3374 freemsg(mp); 3375 goto done; 3376 } 3377 3378 /* 3379 * Parse the options and update ixa and ipp as a result. 
3380 */ 3381 3382 coa = &coas; 3383 coa->coa_connp = connp; 3384 coa->coa_ixa = ixa; 3385 coa->coa_ipp = ipp; 3386 coa->coa_ancillary = B_TRUE; 3387 coa->coa_changed = 0; 3388 3389 if (msg != NULL) { 3390 error = process_auxiliary_options(connp, msg->msg_control, 3391 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3392 } else { 3393 struct T_unitdata_req *tudr; 3394 3395 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3396 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3397 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3398 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3399 coa, &is_absreq_failure); 3400 } 3401 if (error != 0) { 3402 /* 3403 * Note: No special action needed in this 3404 * module for "is_absreq_failure" 3405 */ 3406 freemsg(mp); 3407 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3408 goto done; 3409 } 3410 ASSERT(is_absreq_failure == 0); 3411 3412 mutex_enter(&connp->conn_lock); 3413 /* 3414 * If laddr is unspecified then we look at sin6_src_id. 3415 * We will give precedence to a source address set with IPV6_PKTINFO 3416 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3417 * want ip_attr_connect to select a source (since it can fail) when 3418 * IPV6_PKTINFO is specified. 3419 * If this doesn't result in a source address then we get a source 3420 * from ip_attr_connect() below. 
3421 */ 3422 v6src = connp->conn_saddr_v6; 3423 if (sin != NULL) { 3424 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3425 dstport = sin->sin_port; 3426 flowinfo = 0; 3427 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3428 ixa->ixa_flags |= IXAF_IS_IPV4; 3429 } else if (sin6 != NULL) { 3430 v6dst = sin6->sin6_addr; 3431 dstport = sin6->sin6_port; 3432 flowinfo = sin6->sin6_flowinfo; 3433 srcid = sin6->__sin6_src_id; 3434 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3435 ixa->ixa_scopeid = sin6->sin6_scope_id; 3436 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3437 } else { 3438 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3439 } 3440 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3441 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3442 connp->conn_netstack); 3443 } 3444 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 3445 ixa->ixa_flags |= IXAF_IS_IPV4; 3446 else 3447 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3448 } else { 3449 /* Connected case */ 3450 v6dst = connp->conn_faddr_v6; 3451 flowinfo = connp->conn_flowinfo; 3452 } 3453 mutex_exit(&connp->conn_lock); 3454 /* Handle IPV6_PKTINFO setting source address. 
*/ 3455 if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && 3456 (ipp->ipp_fields & IPPF_ADDR)) { 3457 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3458 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3459 v6src = ipp->ipp_addr; 3460 } else { 3461 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3462 v6src = ipp->ipp_addr; 3463 } 3464 } 3465 /* 3466 * Allow source not assigned to the system 3467 * only if it is not a local addresses 3468 */ 3469 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3470 ip_laddr_t laddr_type; 3471 3472 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3473 ipaddr_t v4src; 3474 3475 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3476 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3477 is->is_netstack->netstack_ip, B_FALSE); 3478 } else { 3479 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3480 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3481 } 3482 if (laddr_type != IPVL_UNICAST_UP) 3483 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3484 } 3485 3486 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3487 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3488 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3489 3490 switch (error) { 3491 case 0: 3492 break; 3493 case EADDRNOTAVAIL: 3494 /* 3495 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3496 * Don't have the application see that errno 3497 */ 3498 error = ENETUNREACH; 3499 goto failed; 3500 case ENETDOWN: 3501 /* 3502 * Have !ipif_addr_ready address; drop packet silently 3503 * until we can get applications to not send until we 3504 * are ready. 3505 */ 3506 error = 0; 3507 goto failed; 3508 case EHOSTUNREACH: 3509 case ENETUNREACH: 3510 if (ixa->ixa_ire != NULL) { 3511 /* 3512 * Let conn_ip_output/ire_send_noroute return 3513 * the error and send any local ICMP error. 
3514 */ 3515 error = 0; 3516 break; 3517 } 3518 /* FALLTHRU */ 3519 default: 3520 failed: 3521 freemsg(mp); 3522 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3523 goto done; 3524 } 3525 3526 /* 3527 * We might be going to a different destination than last time, 3528 * thus check that TX allows the communication and compute any 3529 * needed label. 3530 * 3531 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3532 * don't have to worry about concurrent threads. 3533 */ 3534 if (is_system_labeled()) { 3535 /* 3536 * Check whether Trusted Solaris policy allows communication 3537 * with this host, and pretend that the destination is 3538 * unreachable if not. 3539 * Compute any needed label and place it in ipp_label_v4/v6. 3540 * 3541 * Later conn_build_hdr_template/conn_prepend_hdr takes 3542 * ipp_label_v4/v6 to form the packet. 3543 * 3544 * Tsol note: We have ipp structure local to this thread so 3545 * no locking is needed. 3546 */ 3547 error = conn_update_label(connp, ixa, &v6dst, ipp); 3548 if (error != 0) { 3549 freemsg(mp); 3550 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3551 goto done; 3552 } 3553 } 3554 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3555 &error); 3556 if (mp == NULL) { 3557 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3558 ASSERT(error != 0); 3559 goto done; 3560 } 3561 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3562 error = EMSGSIZE; 3563 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3564 freemsg(mp); 3565 goto done; 3566 } 3567 3568 /* Policy might differ for different ICMP type/code */ 3569 mp = icmp_output_attach_policy(mp, connp, ixa); 3570 if (mp == NULL) { 3571 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3572 error = EHOSTUNREACH; /* IPsec policy failure */ 3573 goto done; 3574 } 3575 3576 /* We're done. Pass the packet to ip. 
*/ 3577 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3578 3579 error = conn_ip_output(mp, ixa); 3580 if (!connp->conn_unspec_src) 3581 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3582 /* No rawipOutErrors if an error since IP increases its error counter */ 3583 switch (error) { 3584 case 0: 3585 break; 3586 case EWOULDBLOCK: 3587 (void) ixa_check_drain_insert(connp, ixa); 3588 error = 0; 3589 break; 3590 case EADDRNOTAVAIL: 3591 /* 3592 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3593 * Don't have the application see that errno 3594 */ 3595 error = ENETUNREACH; 3596 /* FALLTHRU */ 3597 default: 3598 mutex_enter(&connp->conn_lock); 3599 /* 3600 * Clear the source and v6lastdst so we call ip_attr_connect 3601 * for the next packet and try to pick a better source. 3602 */ 3603 if (connp->conn_mcbc_bind) 3604 connp->conn_saddr_v6 = ipv6_all_zeros; 3605 else 3606 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3607 connp->conn_v6lastdst = ipv6_all_zeros; 3608 mutex_exit(&connp->conn_lock); 3609 break; 3610 } 3611 done: 3612 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3613 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3614 ixa->ixa_cpid = connp->conn_cpid; 3615 ixa_refrele(ixa); 3616 ip_pkt_free(ipp); 3617 kmem_free(ipp, sizeof (*ipp)); 3618 return (error); 3619 } 3620 3621 /* 3622 * Handle sending an M_DATA for a connected socket. 3623 * Handles both IPv4 and IPv6. 3624 */ 3625 int 3626 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3627 { 3628 icmp_t *icmp = connp->conn_icmp; 3629 icmp_stack_t *is = icmp->icmp_is; 3630 int error; 3631 ip_xmit_attr_t *ixa; 3632 boolean_t do_ipsec; 3633 3634 /* 3635 * If no other thread is using conn_ixa this just gets a reference to 3636 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 
3637 */ 3638 ixa = conn_get_ixa(connp, B_FALSE); 3639 if (ixa == NULL) { 3640 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3641 freemsg(mp); 3642 return (ENOMEM); 3643 } 3644 3645 ASSERT(cr != NULL); 3646 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3647 ixa->ixa_cred = cr; 3648 ixa->ixa_cpid = pid; 3649 3650 /* Defer IPsec if it might need to look at ICMP type/code */ 3651 switch (ixa->ixa_protocol) { 3652 case IPPROTO_ICMP: 3653 case IPPROTO_ICMPV6: 3654 do_ipsec = B_FALSE; 3655 break; 3656 default: 3657 do_ipsec = B_TRUE; 3658 } 3659 3660 mutex_enter(&connp->conn_lock); 3661 mp = icmp_prepend_header_template(connp, ixa, mp, 3662 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3663 3664 if (mp == NULL) { 3665 ASSERT(error != 0); 3666 mutex_exit(&connp->conn_lock); 3667 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3668 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3669 ixa->ixa_cpid = connp->conn_cpid; 3670 ixa_refrele(ixa); 3671 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3672 freemsg(mp); 3673 return (error); 3674 } 3675 3676 if (!do_ipsec) { 3677 /* Policy might differ for different ICMP type/code */ 3678 mp = icmp_output_attach_policy(mp, connp, ixa); 3679 if (mp == NULL) { 3680 mutex_exit(&connp->conn_lock); 3681 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3682 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3683 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3684 ixa->ixa_cpid = connp->conn_cpid; 3685 ixa_refrele(ixa); 3686 return (EHOSTUNREACH); /* IPsec policy failure */ 3687 } 3688 } 3689 3690 /* 3691 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3692 * safe copy, then we need to fill in any pointers in it. 
3693 */ 3694 if (ixa->ixa_ire == NULL) { 3695 in6_addr_t faddr, saddr; 3696 in6_addr_t nexthop; 3697 in_port_t fport; 3698 3699 saddr = connp->conn_saddr_v6; 3700 faddr = connp->conn_faddr_v6; 3701 fport = connp->conn_fport; 3702 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3703 mutex_exit(&connp->conn_lock); 3704 3705 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3706 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3707 (do_ipsec ? IPDF_IPSEC : 0)); 3708 switch (error) { 3709 case 0: 3710 break; 3711 case EADDRNOTAVAIL: 3712 /* 3713 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3714 * Don't have the application see that errno 3715 */ 3716 error = ENETUNREACH; 3717 goto failed; 3718 case ENETDOWN: 3719 /* 3720 * Have !ipif_addr_ready address; drop packet silently 3721 * until we can get applications to not send until we 3722 * are ready. 3723 */ 3724 error = 0; 3725 goto failed; 3726 case EHOSTUNREACH: 3727 case ENETUNREACH: 3728 if (ixa->ixa_ire != NULL) { 3729 /* 3730 * Let conn_ip_output/ire_send_noroute return 3731 * the error and send any local ICMP error. 3732 */ 3733 error = 0; 3734 break; 3735 } 3736 /* FALLTHRU */ 3737 default: 3738 failed: 3739 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3740 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3741 ixa->ixa_cpid = connp->conn_cpid; 3742 ixa_refrele(ixa); 3743 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3744 freemsg(mp); 3745 return (error); 3746 } 3747 } else { 3748 /* Done with conn_t */ 3749 mutex_exit(&connp->conn_lock); 3750 } 3751 3752 /* We're done. Pass the packet to ip. 
*/ 3753 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3754 3755 error = conn_ip_output(mp, ixa); 3756 /* No rawipOutErrors if an error since IP increases its error counter */ 3757 switch (error) { 3758 case 0: 3759 break; 3760 case EWOULDBLOCK: 3761 (void) ixa_check_drain_insert(connp, ixa); 3762 error = 0; 3763 break; 3764 case EADDRNOTAVAIL: 3765 /* 3766 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3767 * Don't have the application see that errno 3768 */ 3769 error = ENETUNREACH; 3770 break; 3771 } 3772 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3773 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3774 ixa->ixa_cpid = connp->conn_cpid; 3775 ixa_refrele(ixa); 3776 return (error); 3777 } 3778 3779 /* 3780 * Handle sending an M_DATA to the last destination. 3781 * Handles both IPv4 and IPv6. 3782 * 3783 * NOTE: The caller must hold conn_lock and we drop it here. 3784 */ 3785 int 3786 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3787 ip_xmit_attr_t *ixa) 3788 { 3789 icmp_t *icmp = connp->conn_icmp; 3790 icmp_stack_t *is = icmp->icmp_is; 3791 int error; 3792 boolean_t do_ipsec; 3793 3794 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3795 ASSERT(ixa != NULL); 3796 3797 ASSERT(cr != NULL); 3798 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3799 ixa->ixa_cred = cr; 3800 ixa->ixa_cpid = pid; 3801 3802 /* Defer IPsec if it might need to look at ICMP type/code */ 3803 switch (ixa->ixa_protocol) { 3804 case IPPROTO_ICMP: 3805 case IPPROTO_ICMPV6: 3806 do_ipsec = B_FALSE; 3807 break; 3808 default: 3809 do_ipsec = B_TRUE; 3810 } 3811 3812 3813 mp = icmp_prepend_header_template(connp, ixa, mp, 3814 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3815 3816 if (mp == NULL) { 3817 ASSERT(error != 0); 3818 mutex_exit(&connp->conn_lock); 3819 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3820 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3821 ixa->ixa_cpid = connp->conn_cpid; 3822 ixa_refrele(ixa); 3823 BUMP_MIB(&is->is_rawip_mib, 
rawipOutErrors); 3824 freemsg(mp); 3825 return (error); 3826 } 3827 3828 if (!do_ipsec) { 3829 /* Policy might differ for different ICMP type/code */ 3830 mp = icmp_output_attach_policy(mp, connp, ixa); 3831 if (mp == NULL) { 3832 mutex_exit(&connp->conn_lock); 3833 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3834 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3835 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3836 ixa->ixa_cpid = connp->conn_cpid; 3837 ixa_refrele(ixa); 3838 return (EHOSTUNREACH); /* IPsec policy failure */ 3839 } 3840 } 3841 3842 /* 3843 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3844 * safe copy, then we need to fill in any pointers in it. 3845 */ 3846 if (ixa->ixa_ire == NULL) { 3847 in6_addr_t lastdst, lastsrc; 3848 in6_addr_t nexthop; 3849 in_port_t lastport; 3850 3851 lastsrc = connp->conn_v6lastsrc; 3852 lastdst = connp->conn_v6lastdst; 3853 lastport = connp->conn_lastdstport; 3854 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3855 mutex_exit(&connp->conn_lock); 3856 3857 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3858 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3859 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3860 switch (error) { 3861 case 0: 3862 break; 3863 case EADDRNOTAVAIL: 3864 /* 3865 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3866 * Don't have the application see that errno 3867 */ 3868 error = ENETUNREACH; 3869 goto failed; 3870 case ENETDOWN: 3871 /* 3872 * Have !ipif_addr_ready address; drop packet silently 3873 * until we can get applications to not send until we 3874 * are ready. 3875 */ 3876 error = 0; 3877 goto failed; 3878 case EHOSTUNREACH: 3879 case ENETUNREACH: 3880 if (ixa->ixa_ire != NULL) { 3881 /* 3882 * Let conn_ip_output/ire_send_noroute return 3883 * the error and send any local ICMP error. 
3884 */ 3885 error = 0; 3886 break; 3887 } 3888 /* FALLTHRU */ 3889 default: 3890 failed: 3891 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3892 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3893 ixa->ixa_cpid = connp->conn_cpid; 3894 ixa_refrele(ixa); 3895 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3896 freemsg(mp); 3897 return (error); 3898 } 3899 } else { 3900 /* Done with conn_t */ 3901 mutex_exit(&connp->conn_lock); 3902 } 3903 3904 /* We're done. Pass the packet to ip. */ 3905 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3906 error = conn_ip_output(mp, ixa); 3907 /* No rawipOutErrors if an error since IP increases its error counter */ 3908 switch (error) { 3909 case 0: 3910 break; 3911 case EWOULDBLOCK: 3912 (void) ixa_check_drain_insert(connp, ixa); 3913 error = 0; 3914 break; 3915 case EADDRNOTAVAIL: 3916 /* 3917 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3918 * Don't have the application see that errno 3919 */ 3920 error = ENETUNREACH; 3921 /* FALLTHRU */ 3922 default: 3923 mutex_enter(&connp->conn_lock); 3924 /* 3925 * Clear the source and v6lastdst so we call ip_attr_connect 3926 * for the next packet and try to pick a better source. 3927 */ 3928 if (connp->conn_mcbc_bind) 3929 connp->conn_saddr_v6 = ipv6_all_zeros; 3930 else 3931 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3932 connp->conn_v6lastdst = ipv6_all_zeros; 3933 mutex_exit(&connp->conn_lock); 3934 break; 3935 } 3936 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3937 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3938 ixa->ixa_cpid = connp->conn_cpid; 3939 ixa_refrele(ixa); 3940 return (error); 3941 } 3942 3943 3944 /* 3945 * Prepend the header template and then fill in the source and 3946 * flowinfo. The caller needs to handle the destination address since 3947 * it's setting is different if rthdr or source route. 3948 * 3949 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3950 * When it returns NULL it sets errorp. 
/*
 * Prepend the header template and then fill in the source and
 * flowinfo. The caller needs to handle the destination address since
 * its setting is different if rthdr or source route.
 *
 * Also records the total packet length and the IP header length in
 * ixa->ixa_pktlen and ixa->ixa_ip_hdr_length.
 *
 * The caller must hold conn_lock.
 *
 * Returns NULL if allocation failed or if the packet would exceed
 * IP_MAXPACKET. When it returns NULL it sets *errorp (ENOMEM or EMSGSIZE)
 * and the message has already been freed.  On success the returned mblk
 * may differ from the one passed in.
 */
static mblk_t *
icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
    const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
{
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	uint_t		pktlen;
	uint_t		copylen;
	uint8_t		*iph;
	uint_t		ip_hdr_length;
	uint32_t	cksum;
	ip_pkt_t	*ipp;

	ASSERT(MUTEX_HELD(&connp->conn_lock));

	/*
	 * Copy the header template.
	 */
	copylen = connp->conn_ht_iphc_len;
	pktlen = copylen + msgdsize(mp);
	if (pktlen > IP_MAXPACKET) {
		freemsg(mp);
		*errorp = EMSGSIZE;
		return (NULL);
	}
	ixa->ixa_pktlen = pktlen;

	/* check/fix buffer config, setup pointers into it */
	iph = mp->b_rptr - copylen;
	/*
	 * The header can go in front of the data in place only if the data
	 * block is not shared, has copylen bytes of room before b_rptr, and
	 * the resulting start is 32-bit aligned.  Otherwise allocate a new
	 * leading mblk and place the header at its end.
	 */
	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
		mblk_t *mp1;

		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			*errorp = ENOMEM;
			return (NULL);
		}
		mp1->b_wptr = DB_LIM(mp1);
		mp1->b_cont = mp;
		mp = mp1;
		iph = (mp->b_wptr - copylen);
	}
	mp->b_rptr = iph;
	bcopy(connp->conn_ht_iphc, iph, copylen);
	/* The IP header is the part of the template that precedes conn_ht_ulp */
	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);

	ixa->ixa_ip_hdr_length = ip_hdr_length;

	/*
	 * Prepare for ICMPv6 checksum done in IP.
	 *
	 * icmp_build_hdr_template has already massaged any routing header
	 * and placed the result in conn_sum.
	 *
	 * We make it easy for IP to include our pseudo header
	 * by putting our length (and any routing header adjustment)
	 * in the ICMPv6 checksum field.
	 */
	cksum = pktlen - ip_hdr_length;

	cksum += connp->conn_sum;
	/* Fold the carry into the low 16 bits; one fold is asserted enough */
	cksum = (cksum >> 16) + (cksum & 0xFFFF);
	ASSERT(cksum < 0x10000);

	ipp = &connp->conn_xmit_ipp;
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)iph;

		ipha->ipha_length = htons((uint16_t)pktlen);

		/* if IP_PKTINFO specified an address it wins over bind() */
		if ((ipp->ipp_fields & IPPF_ADDR) &&
		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
			ipha->ipha_src = ipp->ipp_addr_v4;
		} else {
			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
		}
	} else {
		ip6_t *ip6h = (ip6_t *)iph;
		uint_t	cksum_offset = 0;

		ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN));

		/* if IP_PKTINFO specified an address it wins over bind() */
		if ((ipp->ipp_fields & IPPF_ADDR) &&
		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
			ip6h->ip6_src = ipp->ipp_addr;
		} else {
			ip6h->ip6_src = *v6src;
		}
		/* Default version bits combined with the caller's flowinfo */
		ip6h->ip6_vcf =
		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
		if (ipp->ipp_fields & IPPF_TCLASS) {
			/* Overrides the class part of flowinfo */
			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
			    ipp->ipp_tclass);
		}

		/*
		 * Locate the checksum field: the ICMPv6 header field for
		 * ICMPv6, or the offset stored in ixa_raw_cksum_offset when
		 * IXAF_SET_RAW_CKSUM is set.  An offset of 0 means no
		 * checksum field is written.
		 */
		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
			if (connp->conn_proto == IPPROTO_ICMPV6) {
				cksum_offset = ixa->ixa_ip_hdr_length +
				    offsetof(icmp6_t, icmp6_cksum);
			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
				cksum_offset = ixa->ixa_ip_hdr_length +
				    ixa->ixa_raw_cksum_offset;
			}
		}
		if (cksum_offset != 0) {
			uint16_t *ptr;

			/* Make sure the checksum fits in the first mblk */
			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
				mblk_t *mp1;

				mp1 = msgpullup(mp,
				    cksum_offset + sizeof (short));
				/* The original chain is freed either way */
				freemsg(mp);
				if (mp1 == NULL) {
					*errorp = ENOMEM;
					return (NULL);
				}
				mp = mp1;
				iph = mp->b_rptr;
				ip6h = (ip6_t *)iph;
			}
			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
			*ptr = htons(cksum);
		}
	}

	return (mp);
}
/*
 * This routine handles all messages passed downstream. It either
 * consumes the message or passes it downstream; it never queues
 * a message.
 *
 * Only a well-formed T_UNITDATA_REQ (M_PROTO/M_PCPROTO) is processed here.
 * M_DATA is dropped and counted as an output error; every other message is
 * handed to icmp_wput_other().
 *
 * For a T_UNITDATA_REQ the data is detached from the TPI header (mp) and
 * sent through one of the icmp_output_*() routines.  On success the header
 * is freed; on failure it is passed to icmp_ud_err() together with the
 * errno.
 */
void
icmp_wput(queue_t *q, mblk_t *mp)
{
	sin6_t		*sin6;
	sin_t		*sin = NULL;
	uint_t		srcid;
	conn_t		*connp = Q_TO_CONN(q);
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	struct sockaddr	*addr = NULL;
	socklen_t	addrlen;
	icmp_stack_t	*is = icmp->icmp_is;
	struct T_unitdata_req *tudr;
	mblk_t		*data_mp;
	cred_t		*cr;
	pid_t		pid;

	/*
	 * We directly handle only one case here: a T_UNITDATA_REQ message
	 * coming down as M_PROTO/M_PCPROTO.  M_DATA is not expected and
	 * is dropped.
	 */
	switch (DB_TYPE(mp)) {
	case M_DATA:
		/* sockfs never sends down M_DATA */
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		return;

	case M_PROTO:
	case M_PCPROTO:
		tudr = (struct T_unitdata_req *)mp->b_rptr;
		/* Too short for, or not, a T_UNITDATA_REQ: not ours */
		if (MBLKL(mp) < sizeof (*tudr) ||
		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
			icmp_wput_other(q, mp);
			return;
		}
		break;

	default:
		icmp_wput_other(q, mp);
		return;
	}

	/* Handle valid T_UNITDATA_REQ here */
	data_mp = mp->b_cont;
	if (data_mp == NULL) {
		error = EPROTO;
		goto ud_error2;
	}
	/* From here on mp (TPI header) and data_mp are separate messages */
	mp->b_cont = NULL;

	/* The destination address must lie within the header mblk */
	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
		error = EADDRNOTAVAIL;
		goto ud_error2;
	}

	/*
	 * All Solaris components should pass a db_credp
	 * for this message, hence we ASSERT.
	 * On production kernels we return an error to be robust against
	 * random streams modules sitting on top of us.
	 */
	cr = msg_getcred(mp, &pid);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		error = EINVAL;
		goto ud_error2;
	}

	/*
	 * If a port has not been bound to the stream, fail.
	 * This is not a problem when sockfs is directly
	 * above us, because it will ensure that the socket
	 * is first bound before allowing data to be sent.
	 */
	if (icmp->icmp_state == TS_UNBND) {
		error = EPROTO;
		goto ud_error2;
	}
	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
	addrlen = tudr->DEST_length;

	switch (connp->conn_family) {
	case AF_INET6:
		sin6 = (sin6_t *)addr;
		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
		    (sin6->sin6_family != AF_INET6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		srcid = sin6->__sin6_src_id;

		/*
		 * If the local address is a mapped address return
		 * an error.
		 * It would be possible to send an IPv6 packet but the
		 * response would never make it back to the application
		 * since it is bound to a mapped address.
		 */
		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* An unspecified destination is sent to the loopback address */
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			sin6->sin6_addr = ipv6_loopback;

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v6(connp, sin6)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, NULL, sin6,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v6 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v6(connp, sin6) &&
			    connp->conn_lastsrcid == srcid &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, NULL,
				    sin6, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Data was sent; only the TPI header remains */
			freeb(mp);
			return;
		}
		break;

	case AF_INET:
		sin = (sin_t *)addr;
		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
		    (sin->sin_family != AF_INET)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		/* An unspecified destination is sent to the loopback address */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		/* Protocol 255 contains full IP headers */
		/* Read without holding lock */
		if (icmp->icmp_hdrincl) {
			/* Need at least a simple IP header in the first mblk */
			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
					error = EINVAL;
					goto ud_error2;
				}
			}
			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
			if (error == 0) {
				freeb(mp);
				return;
			}
			/* data_mp consumed above */
			data_mp = NULL;
			goto ud_error2;
		}

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v4(connp, sin)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, sin, NULL,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v4 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v4(connp, sin) &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, sin,
				    NULL, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Data was sent; only the TPI header remains */
			freeb(mp);
			return;
		}
		break;
	}
	/*
	 * An output routine failed.  data_mp is not freed on this path, and
	 * the output error counter is not bumped here.
	 */
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
	return;

ud_error2:
	/* Failure before the data was handed off (or data_mp set to NULL) */
	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
	freemsg(data_mp);
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
}
4348 */ 4349 static int 4350 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4351 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4352 { 4353 icmp_t *icmp = connp->conn_icmp; 4354 icmp_stack_t *is = icmp->icmp_is; 4355 int error; 4356 ip_xmit_attr_t *oldixa; 4357 boolean_t do_ipsec; 4358 uint_t srcid; 4359 uint32_t flowinfo; 4360 in6_addr_t v6src; 4361 in6_addr_t v6dst; 4362 in6_addr_t v6nexthop; 4363 in_port_t dstport; 4364 4365 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4366 ASSERT(ixa != NULL); 4367 4368 /* 4369 * We hold conn_lock across all the use and modifications of 4370 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4371 * stay consistent. 4372 */ 4373 4374 ASSERT(cr != NULL); 4375 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4376 ixa->ixa_cred = cr; 4377 ixa->ixa_cpid = pid; 4378 if (is_system_labeled()) { 4379 /* We need to restart with a label based on the cred */ 4380 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4381 } 4382 /* 4383 * If we are connected then the destination needs to be the 4384 * same as the connected one, which is not the case here since we 4385 * checked for that above. 4386 */ 4387 if (icmp->icmp_state == TS_DATA_XFER) { 4388 mutex_exit(&connp->conn_lock); 4389 error = EISCONN; 4390 goto ud_error; 4391 } 4392 4393 /* In case previous destination was multicast or multirt */ 4394 ip_attr_newdst(ixa); 4395 4396 /* 4397 * If laddr is unspecified then we look at sin6_src_id. 4398 * We will give precedence to a source address set with IPV6_PKTINFO 4399 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4400 * want ip_attr_connect to select a source (since it can fail) when 4401 * IPV6_PKTINFO is specified. 4402 * If this doesn't result in a source address then we get a source 4403 * from ip_attr_connect() below. 
4404 */ 4405 v6src = connp->conn_saddr_v6; 4406 if (sin != NULL) { 4407 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4408 dstport = sin->sin_port; 4409 flowinfo = 0; 4410 srcid = 0; 4411 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4412 if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { 4413 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4414 connp->conn_netstack); 4415 } 4416 ixa->ixa_flags |= IXAF_IS_IPV4; 4417 } else { 4418 v6dst = sin6->sin6_addr; 4419 dstport = sin6->sin6_port; 4420 flowinfo = sin6->sin6_flowinfo; 4421 srcid = sin6->__sin6_src_id; 4422 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4423 ixa->ixa_scopeid = sin6->sin6_scope_id; 4424 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4425 } else { 4426 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4427 } 4428 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4429 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4430 connp->conn_netstack); 4431 } 4432 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 4433 ixa->ixa_flags |= IXAF_IS_IPV4; 4434 else 4435 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4436 } 4437 /* Handle IPV6_PKTINFO setting source address. */ 4438 if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && 4439 (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) { 4440 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4441 4442 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4443 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4444 v6src = ipp->ipp_addr; 4445 } else { 4446 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4447 v6src = ipp->ipp_addr; 4448 } 4449 } 4450 4451 /* Defer IPsec if it might need to look at ICMP type/code */ 4452 switch (ixa->ixa_protocol) { 4453 case IPPROTO_ICMP: 4454 case IPPROTO_ICMPV6: 4455 do_ipsec = B_FALSE; 4456 break; 4457 default: 4458 do_ipsec = B_TRUE; 4459 } 4460 4461 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4462 mutex_exit(&connp->conn_lock); 4463 4464 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4465 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4466 (do_ipsec ? 
IPDF_IPSEC : 0)); 4467 switch (error) { 4468 case 0: 4469 break; 4470 case EADDRNOTAVAIL: 4471 /* 4472 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4473 * Don't have the application see that errno 4474 */ 4475 error = ENETUNREACH; 4476 goto failed; 4477 case ENETDOWN: 4478 /* 4479 * Have !ipif_addr_ready address; drop packet silently 4480 * until we can get applications to not send until we 4481 * are ready. 4482 */ 4483 error = 0; 4484 goto failed; 4485 case EHOSTUNREACH: 4486 case ENETUNREACH: 4487 if (ixa->ixa_ire != NULL) { 4488 /* 4489 * Let conn_ip_output/ire_send_noroute return 4490 * the error and send any local ICMP error. 4491 */ 4492 error = 0; 4493 break; 4494 } 4495 /* FALLTHRU */ 4496 default: 4497 failed: 4498 goto ud_error; 4499 } 4500 4501 mutex_enter(&connp->conn_lock); 4502 /* 4503 * While we dropped the lock some other thread might have connected 4504 * this socket. If so we bail out with EISCONN to ensure that the 4505 * connecting thread is the one that updates conn_ixa, conn_ht_* 4506 * and conn_*last*. 4507 */ 4508 if (icmp->icmp_state == TS_DATA_XFER) { 4509 mutex_exit(&connp->conn_lock); 4510 error = EISCONN; 4511 goto ud_error; 4512 } 4513 4514 /* 4515 * We need to rebuild the headers if 4516 * - we are labeling packets (could be different for different 4517 * destinations) 4518 * - we have a source route (or routing header) since we need to 4519 * massage that to get the pseudo-header checksum 4520 * - a socket option with COA_HEADER_CHANGED has been set which 4521 * set conn_v6lastdst to zero. 4522 * 4523 * Otherwise the prepend function will just update the src, dst, 4524 * and flow label. 
4525 */ 4526 if (is_system_labeled()) { 4527 /* TX MLP requires SCM_UCRED and don't have that here */ 4528 if (connp->conn_mlp_type != mlptSingle) { 4529 mutex_exit(&connp->conn_lock); 4530 error = ECONNREFUSED; 4531 goto ud_error; 4532 } 4533 /* 4534 * Check whether Trusted Solaris policy allows communication 4535 * with this host, and pretend that the destination is 4536 * unreachable if not. 4537 * Compute any needed label and place it in ipp_label_v4/v6. 4538 * 4539 * Later conn_build_hdr_template/conn_prepend_hdr takes 4540 * ipp_label_v4/v6 to form the packet. 4541 * 4542 * Tsol note: Since we hold conn_lock we know no other 4543 * thread manipulates conn_xmit_ipp. 4544 */ 4545 error = conn_update_label(connp, ixa, &v6dst, 4546 &connp->conn_xmit_ipp); 4547 if (error != 0) { 4548 mutex_exit(&connp->conn_lock); 4549 goto ud_error; 4550 } 4551 /* Rebuild the header template */ 4552 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4553 flowinfo); 4554 if (error != 0) { 4555 mutex_exit(&connp->conn_lock); 4556 goto ud_error; 4557 } 4558 } else if (connp->conn_xmit_ipp.ipp_fields & 4559 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4560 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4561 /* Rebuild the header template */ 4562 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4563 flowinfo); 4564 if (error != 0) { 4565 mutex_exit(&connp->conn_lock); 4566 goto ud_error; 4567 } 4568 } else { 4569 /* Simply update the destination address if no source route */ 4570 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4571 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4572 4573 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4574 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4575 ipha->ipha_fragment_offset_and_flags |= 4576 IPH_DF_HTONS; 4577 } else { 4578 ipha->ipha_fragment_offset_and_flags &= 4579 ~IPH_DF_HTONS; 4580 } 4581 } else { 4582 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4583 ip6h->ip6_dst = v6dst; 4584 } 4585 } 4586 4587 /* 4588 * Remember the dst etc which corresponds to 
the built header 4589 * template and conn_ixa. 4590 */ 4591 oldixa = conn_replace_ixa(connp, ixa); 4592 connp->conn_v6lastdst = v6dst; 4593 connp->conn_lastflowinfo = flowinfo; 4594 connp->conn_lastscopeid = ixa->ixa_scopeid; 4595 connp->conn_lastsrcid = srcid; 4596 /* Also remember a source to use together with lastdst */ 4597 connp->conn_v6lastsrc = v6src; 4598 4599 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4600 flowinfo, &error); 4601 4602 /* Done with conn_t */ 4603 mutex_exit(&connp->conn_lock); 4604 ixa_refrele(oldixa); 4605 4606 if (data_mp == NULL) { 4607 ASSERT(error != 0); 4608 goto ud_error; 4609 } 4610 4611 if (!do_ipsec) { 4612 /* Policy might differ for different ICMP type/code */ 4613 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4614 if (data_mp == NULL) { 4615 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4616 error = EHOSTUNREACH; /* IPsec policy failure */ 4617 goto done; 4618 } 4619 } 4620 4621 /* We're done. Pass the packet to ip. */ 4622 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4623 4624 error = conn_ip_output(data_mp, ixa); 4625 /* No rawipOutErrors if an error since IP increases its error counter */ 4626 switch (error) { 4627 case 0: 4628 break; 4629 case EWOULDBLOCK: 4630 (void) ixa_check_drain_insert(connp, ixa); 4631 error = 0; 4632 break; 4633 case EADDRNOTAVAIL: 4634 /* 4635 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4636 * Don't have the application see that errno 4637 */ 4638 error = ENETUNREACH; 4639 /* FALLTHRU */ 4640 default: 4641 mutex_enter(&connp->conn_lock); 4642 /* 4643 * Clear the source and v6lastdst so we call ip_attr_connect 4644 * for the next packet and try to pick a better source. 
4645 */ 4646 if (connp->conn_mcbc_bind) 4647 connp->conn_saddr_v6 = ipv6_all_zeros; 4648 else 4649 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4650 connp->conn_v6lastdst = ipv6_all_zeros; 4651 mutex_exit(&connp->conn_lock); 4652 break; 4653 } 4654 done: 4655 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4656 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4657 ixa->ixa_cpid = connp->conn_cpid; 4658 ixa_refrele(ixa); 4659 return (error); 4660 4661 ud_error: 4662 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4663 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4664 ixa->ixa_cpid = connp->conn_cpid; 4665 ixa_refrele(ixa); 4666 4667 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4668 freemsg(data_mp); 4669 return (error); 4670 } 4671 4672 /* ARGSUSED */ 4673 static void 4674 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4675 { 4676 #ifdef DEBUG 4677 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4678 #endif 4679 freemsg(mp); 4680 } 4681 4682 static void 4683 icmp_wput_other(queue_t *q, mblk_t *mp) 4684 { 4685 uchar_t *rptr = mp->b_rptr; 4686 struct iocblk *iocp; 4687 conn_t *connp = Q_TO_CONN(q); 4688 icmp_t *icmp = connp->conn_icmp; 4689 cred_t *cr; 4690 4691 switch (mp->b_datap->db_type) { 4692 case M_PROTO: 4693 case M_PCPROTO: 4694 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4695 /* 4696 * If the message does not contain a PRIM_type, 4697 * throw it away. 4698 */ 4699 freemsg(mp); 4700 return; 4701 } 4702 switch (((t_primp_t)rptr)->type) { 4703 case T_ADDR_REQ: 4704 icmp_addr_req(q, mp); 4705 return; 4706 case O_T_BIND_REQ: 4707 case T_BIND_REQ: 4708 icmp_tpi_bind(q, mp); 4709 return; 4710 case T_CONN_REQ: 4711 icmp_tpi_connect(q, mp); 4712 return; 4713 case T_CAPABILITY_REQ: 4714 icmp_capability_req(q, mp); 4715 return; 4716 case T_INFO_REQ: 4717 icmp_info_req(q, mp); 4718 return; 4719 case T_UNITDATA_REQ: 4720 /* 4721 * If a T_UNITDATA_REQ gets here, the address must 4722 * be bad. 
Valid T_UNITDATA_REQs are handled 4723 * in icmp_wput. 4724 */ 4725 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4726 return; 4727 case T_UNBIND_REQ: 4728 icmp_tpi_unbind(q, mp); 4729 return; 4730 case T_SVR4_OPTMGMT_REQ: 4731 /* 4732 * All Solaris components should pass a db_credp 4733 * for this TPI message, hence we ASSERT. 4734 * But in case there is some other M_PROTO that looks 4735 * like a TPI message sent by some other kernel 4736 * component, we check and return an error. 4737 */ 4738 cr = msg_getcred(mp, NULL); 4739 ASSERT(cr != NULL); 4740 if (cr == NULL) { 4741 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4742 return; 4743 } 4744 4745 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4746 cr)) { 4747 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4748 } 4749 return; 4750 4751 case T_OPTMGMT_REQ: 4752 /* 4753 * All Solaris components should pass a db_credp 4754 * for this TPI message, hence we ASSERT. 4755 * But in case there is some other M_PROTO that looks 4756 * like a TPI message sent by some other kernel 4757 * component, we check and return an error. 4758 */ 4759 cr = msg_getcred(mp, NULL); 4760 ASSERT(cr != NULL); 4761 if (cr == NULL) { 4762 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4763 return; 4764 } 4765 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4766 return; 4767 4768 case T_DISCON_REQ: 4769 icmp_tpi_disconnect(q, mp); 4770 return; 4771 4772 /* The following TPI message is not supported by icmp. */ 4773 case O_T_CONN_RES: 4774 case T_CONN_RES: 4775 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4776 return; 4777 4778 /* The following 3 TPI requests are illegal for icmp. 
*/ 4779 case T_DATA_REQ: 4780 case T_EXDATA_REQ: 4781 case T_ORDREL_REQ: 4782 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4783 return; 4784 default: 4785 break; 4786 } 4787 break; 4788 case M_FLUSH: 4789 if (*rptr & FLUSHW) 4790 flushq(q, FLUSHDATA); 4791 break; 4792 case M_IOCTL: 4793 iocp = (struct iocblk *)mp->b_rptr; 4794 switch (iocp->ioc_cmd) { 4795 case TI_GETPEERNAME: 4796 if (icmp->icmp_state != TS_DATA_XFER) { 4797 /* 4798 * If a default destination address has not 4799 * been associated with the stream, then we 4800 * don't know the peer's name. 4801 */ 4802 iocp->ioc_error = ENOTCONN; 4803 iocp->ioc_count = 0; 4804 mp->b_datap->db_type = M_IOCACK; 4805 qreply(q, mp); 4806 return; 4807 } 4808 /* FALLTHRU */ 4809 case TI_GETMYNAME: 4810 /* 4811 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4812 * need to copyin the user's strbuf structure. 4813 * Processing will continue in the M_IOCDATA case 4814 * below. 4815 */ 4816 mi_copyin(q, mp, NULL, 4817 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4818 return; 4819 default: 4820 break; 4821 } 4822 break; 4823 case M_IOCDATA: 4824 icmp_wput_iocdata(q, mp); 4825 return; 4826 default: 4827 /* Unrecognized messages are passed through without change. */ 4828 break; 4829 } 4830 ip_wput_nondata(q, mp); 4831 } 4832 4833 /* 4834 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4835 * messages. 4836 */ 4837 static void 4838 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4839 { 4840 mblk_t *mp1; 4841 STRUCT_HANDLE(strbuf, sb); 4842 uint_t addrlen; 4843 conn_t *connp = Q_TO_CONN(q); 4844 icmp_t *icmp = connp->conn_icmp; 4845 4846 /* Make sure it is one of ours. 
*/ 4847 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4848 case TI_GETMYNAME: 4849 case TI_GETPEERNAME: 4850 break; 4851 default: 4852 ip_wput_nondata(q, mp); 4853 return; 4854 } 4855 4856 switch (mi_copy_state(q, mp, &mp1)) { 4857 case -1: 4858 return; 4859 case MI_COPY_CASE(MI_COPY_IN, 1): 4860 break; 4861 case MI_COPY_CASE(MI_COPY_OUT, 1): 4862 /* 4863 * The address has been copied out, so now 4864 * copyout the strbuf. 4865 */ 4866 mi_copyout(q, mp); 4867 return; 4868 case MI_COPY_CASE(MI_COPY_OUT, 2): 4869 /* 4870 * The address and strbuf have been copied out. 4871 * We're done, so just acknowledge the original 4872 * M_IOCTL. 4873 */ 4874 mi_copy_done(q, mp, 0); 4875 return; 4876 default: 4877 /* 4878 * Something strange has happened, so acknowledge 4879 * the original M_IOCTL with an EPROTO error. 4880 */ 4881 mi_copy_done(q, mp, EPROTO); 4882 return; 4883 } 4884 4885 /* 4886 * Now we have the strbuf structure for TI_GETMYNAME 4887 * and TI_GETPEERNAME. Next we copyout the requested 4888 * address and then we'll copyout the strbuf. 
4889 */ 4890 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4891 (void *)mp1->b_rptr); 4892 4893 if (connp->conn_family == AF_INET) 4894 addrlen = sizeof (sin_t); 4895 else 4896 addrlen = sizeof (sin6_t); 4897 4898 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4899 mi_copy_done(q, mp, EINVAL); 4900 return; 4901 } 4902 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4903 case TI_GETMYNAME: 4904 break; 4905 case TI_GETPEERNAME: 4906 if (icmp->icmp_state != TS_DATA_XFER) { 4907 mi_copy_done(q, mp, ENOTCONN); 4908 return; 4909 } 4910 break; 4911 default: 4912 mi_copy_done(q, mp, EPROTO); 4913 return; 4914 } 4915 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4916 if (!mp1) 4917 return; 4918 4919 STRUCT_FSET(sb, len, addrlen); 4920 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4921 case TI_GETMYNAME: 4922 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4923 &addrlen); 4924 break; 4925 case TI_GETPEERNAME: 4926 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4927 &addrlen); 4928 break; 4929 } 4930 mp1->b_wptr += addrlen; 4931 /* Copy out the address */ 4932 mi_copyout(q, mp); 4933 } 4934 4935 void 4936 icmp_ddi_g_init(void) 4937 { 4938 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4939 icmp_opt_obj.odb_opt_arr_cnt); 4940 4941 /* 4942 * We want to be informed each time a stack is created or 4943 * destroyed in the kernel, so we can maintain the 4944 * set of icmp_stack_t's. 4945 */ 4946 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4947 } 4948 4949 void 4950 icmp_ddi_g_destroy(void) 4951 { 4952 netstack_unregister(NS_ICMP); 4953 } 4954 4955 #define INET_NAME "ip" 4956 4957 /* 4958 * Initialize the ICMP stack instance. 
4959 */ 4960 static void * 4961 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 4962 { 4963 icmp_stack_t *is; 4964 int error = 0; 4965 size_t arrsz; 4966 major_t major; 4967 4968 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 4969 is->is_netstack = ns; 4970 4971 arrsz = sizeof (icmp_propinfo_tbl); 4972 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 4973 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 4974 4975 is->is_ksp = rawip_kstat_init(stackid); 4976 4977 major = mod_name_to_major(INET_NAME); 4978 error = ldi_ident_from_major(major, &is->is_ldi_ident); 4979 ASSERT(error == 0); 4980 return (is); 4981 } 4982 4983 /* 4984 * Free the ICMP stack instance. 4985 */ 4986 static void 4987 rawip_stack_fini(netstackid_t stackid, void *arg) 4988 { 4989 icmp_stack_t *is = (icmp_stack_t *)arg; 4990 4991 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 4992 is->is_propinfo_tbl = NULL; 4993 4994 rawip_kstat_fini(stackid, is->is_ksp); 4995 is->is_ksp = NULL; 4996 ldi_ident_release(is->is_ldi_ident); 4997 kmem_free(is, sizeof (*is)); 4998 } 4999 5000 static void * 5001 rawip_kstat_init(netstackid_t stackid) { 5002 kstat_t *ksp; 5003 5004 rawip_named_kstat_t template = { 5005 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5006 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5007 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5008 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5009 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5010 }; 5011 5012 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5013 KSTAT_TYPE_NAMED, 5014 NUM_OF_FIELDS(rawip_named_kstat_t), 5015 0, stackid); 5016 if (ksp == NULL || ksp->ks_data == NULL) 5017 return (NULL); 5018 5019 bcopy(&template, ksp->ks_data, sizeof (template)); 5020 ksp->ks_update = rawip_kstat_update; 5021 ksp->ks_private = (void *)(uintptr_t)stackid; 5022 5023 kstat_install(ksp); 5024 return (ksp); 5025 } 5026 5027 static void 5028 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5029 { 5030 if (ksp != NULL) 
{ 5031 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5032 kstat_delete_netstack(ksp, stackid); 5033 } 5034 } 5035 5036 static int 5037 rawip_kstat_update(kstat_t *ksp, int rw) 5038 { 5039 rawip_named_kstat_t *rawipkp; 5040 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5041 netstack_t *ns; 5042 icmp_stack_t *is; 5043 5044 if ((ksp == NULL) || (ksp->ks_data == NULL)) 5045 return (EIO); 5046 5047 if (rw == KSTAT_WRITE) 5048 return (EACCES); 5049 5050 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5051 5052 ns = netstack_find_by_stackid(stackid); 5053 if (ns == NULL) 5054 return (-1); 5055 is = ns->netstack_icmp; 5056 if (is == NULL) { 5057 netstack_rele(ns); 5058 return (-1); 5059 } 5060 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5061 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5062 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5063 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5064 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5065 netstack_rele(ns); 5066 return (0); 5067 } 5068 5069 /* ARGSUSED */ 5070 int 5071 rawip_accept(sock_lower_handle_t lproto_handle, 5072 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5073 cred_t *cr) 5074 { 5075 return (EOPNOTSUPP); 5076 } 5077 5078 /* ARGSUSED */ 5079 int 5080 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5081 socklen_t len, cred_t *cr) 5082 { 5083 conn_t *connp = (conn_t *)proto_handle; 5084 int error; 5085 5086 /* All Solaris components should pass a cred for this operation. 
*/ 5087 ASSERT(cr != NULL); 5088 5089 /* Binding to a NULL address really means unbind */ 5090 if (sa == NULL) 5091 error = rawip_do_unbind(connp); 5092 else 5093 error = rawip_do_bind(connp, sa, len); 5094 5095 if (error < 0) { 5096 if (error == -TOUTSTATE) 5097 error = EINVAL; 5098 else 5099 error = proto_tlitosyserr(-error); 5100 } 5101 return (error); 5102 } 5103 5104 static int 5105 rawip_implicit_bind(conn_t *connp) 5106 { 5107 sin6_t sin6addr; 5108 sin_t *sin; 5109 sin6_t *sin6; 5110 socklen_t len; 5111 int error; 5112 5113 if (connp->conn_family == AF_INET) { 5114 len = sizeof (struct sockaddr_in); 5115 sin = (sin_t *)&sin6addr; 5116 *sin = sin_null; 5117 sin->sin_family = AF_INET; 5118 sin->sin_addr.s_addr = INADDR_ANY; 5119 } else { 5120 ASSERT(connp->conn_family == AF_INET6); 5121 len = sizeof (sin6_t); 5122 sin6 = (sin6_t *)&sin6addr; 5123 *sin6 = sin6_null; 5124 sin6->sin6_family = AF_INET6; 5125 V6_SET_ZERO(sin6->sin6_addr); 5126 } 5127 5128 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5129 5130 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5131 } 5132 5133 static int 5134 rawip_unbind(conn_t *connp) 5135 { 5136 int error; 5137 5138 error = rawip_do_unbind(connp); 5139 if (error < 0) { 5140 error = proto_tlitosyserr(-error); 5141 } 5142 return (error); 5143 } 5144 5145 /* ARGSUSED */ 5146 int 5147 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5148 { 5149 return (EOPNOTSUPP); 5150 } 5151 5152 int 5153 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5154 socklen_t len, sock_connid_t *id, cred_t *cr) 5155 { 5156 conn_t *connp = (conn_t *)proto_handle; 5157 icmp_t *icmp = connp->conn_icmp; 5158 int error; 5159 boolean_t did_bind = B_FALSE; 5160 pid_t pid = curproc->p_pid; 5161 5162 /* All Solaris components should pass a cred for this operation. 
*/ 5163 ASSERT(cr != NULL); 5164 5165 if (sa == NULL) { 5166 /* 5167 * Disconnect 5168 * Make sure we are connected 5169 */ 5170 if (icmp->icmp_state != TS_DATA_XFER) 5171 return (EINVAL); 5172 5173 error = icmp_disconnect(connp); 5174 return (error); 5175 } 5176 5177 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5178 if (error != 0) 5179 return (error); 5180 5181 /* do an implicit bind if necessary */ 5182 if (icmp->icmp_state == TS_UNBND) { 5183 error = rawip_implicit_bind(connp); 5184 /* 5185 * We could be racing with an actual bind, in which case 5186 * we would see EPROTO. We cross our fingers and try 5187 * to connect. 5188 */ 5189 if (!(error == 0 || error == EPROTO)) 5190 return (error); 5191 did_bind = B_TRUE; 5192 } 5193 5194 /* 5195 * set SO_DGRAM_ERRIND 5196 */ 5197 connp->conn_dgram_errind = B_TRUE; 5198 5199 error = rawip_do_connect(connp, sa, len, cr, pid); 5200 if (error != 0 && did_bind) { 5201 int unbind_err; 5202 5203 unbind_err = rawip_unbind(connp); 5204 ASSERT(unbind_err == 0); 5205 } 5206 5207 if (error == 0) { 5208 *id = 0; 5209 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5210 0, NULL, -1); 5211 } else if (error < 0) { 5212 error = proto_tlitosyserr(-error); 5213 } 5214 return (error); 5215 } 5216 5217 /* ARGSUSED2 */ 5218 int 5219 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5220 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) 5221 { 5222 conn_t *connp = (conn_t *)proto_handle; 5223 icmp_t *icmp; 5224 struct T_capability_ack tca; 5225 struct sockaddr_in6 laddr, faddr; 5226 socklen_t laddrlen, faddrlen; 5227 short opts; 5228 struct stroptions *stropt; 5229 mblk_t *stropt_mp; 5230 int error; 5231 5232 icmp = connp->conn_icmp; 5233 5234 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5235 5236 /* 5237 * setup the fallback stream that was allocated 5238 */ 5239 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5240 connp->conn_minor_arena = WR(q)->q_ptr; 5241 5242 
RD(q)->q_ptr = WR(q)->q_ptr = connp; 5243 5244 WR(q)->q_qinfo = &icmpwinit; 5245 5246 connp->conn_rq = RD(q); 5247 connp->conn_wq = WR(q); 5248 5249 /* Notify stream head about options before sending up data */ 5250 stropt_mp->b_datap->db_type = M_SETOPTS; 5251 stropt_mp->b_wptr += sizeof (*stropt); 5252 stropt = (struct stroptions *)stropt_mp->b_rptr; 5253 stropt->so_flags = SO_WROFF | SO_HIWAT; 5254 stropt->so_wroff = connp->conn_wroff; 5255 stropt->so_hiwat = connp->conn_rcvbuf; 5256 putnext(RD(q), stropt_mp); 5257 5258 /* 5259 * free helper stream 5260 */ 5261 ip_free_helper_stream(connp); 5262 5263 /* 5264 * Collect the information needed to sync with the sonode 5265 */ 5266 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5267 5268 laddrlen = faddrlen = sizeof (sin6_t); 5269 (void) rawip_getsockname((sock_lower_handle_t)connp, 5270 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5271 error = rawip_getpeername((sock_lower_handle_t)connp, 5272 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5273 if (error != 0) 5274 faddrlen = 0; 5275 opts = 0; 5276 if (connp->conn_dgram_errind) 5277 opts |= SO_DGRAM_ERRIND; 5278 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5279 opts |= SO_DONTROUTE; 5280 5281 (*quiesced_cb)(connp->conn_upper_handle, q, &tca, 5282 (struct sockaddr *)&laddr, laddrlen, 5283 (struct sockaddr *)&faddr, faddrlen, opts); 5284 5285 /* 5286 * Attempts to send data up during fallback will result in it being 5287 * queued in icmp_t. Now we push up any queued packets. 
5288 */ 5289 mutex_enter(&icmp->icmp_recv_lock); 5290 while (icmp->icmp_fallback_queue_head != NULL) { 5291 mblk_t *mp; 5292 5293 mp = icmp->icmp_fallback_queue_head; 5294 icmp->icmp_fallback_queue_head = mp->b_next; 5295 mp->b_next = NULL; 5296 mutex_exit(&icmp->icmp_recv_lock); 5297 putnext(RD(q), mp); 5298 mutex_enter(&icmp->icmp_recv_lock); 5299 } 5300 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5301 5302 /* 5303 * No longer a streams less socket 5304 */ 5305 mutex_enter(&connp->conn_lock); 5306 connp->conn_flags &= ~IPCL_NONSTR; 5307 mutex_exit(&connp->conn_lock); 5308 5309 mutex_exit(&icmp->icmp_recv_lock); 5310 5311 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5312 icmp->icmp_fallback_queue_tail == NULL); 5313 5314 ASSERT(connp->conn_ref >= 1); 5315 5316 return (0); 5317 } 5318 5319 /* ARGSUSED2 */ 5320 sock_lower_handle_t 5321 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5322 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5323 { 5324 conn_t *connp; 5325 5326 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5327 *errorp = EPROTONOSUPPORT; 5328 return (NULL); 5329 } 5330 5331 connp = rawip_do_open(family, credp, errorp, flags); 5332 if (connp != NULL) { 5333 connp->conn_flags |= IPCL_NONSTR; 5334 5335 mutex_enter(&connp->conn_lock); 5336 connp->conn_state_flags &= ~CONN_INCIPIENT; 5337 mutex_exit(&connp->conn_lock); 5338 *sock_downcalls = &sock_rawip_downcalls; 5339 *smodep = SM_ATOMIC; 5340 } else { 5341 ASSERT(*errorp != 0); 5342 } 5343 5344 return ((sock_lower_handle_t)connp); 5345 } 5346 5347 /* ARGSUSED3 */ 5348 void 5349 rawip_activate(sock_lower_handle_t proto_handle, 5350 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5351 cred_t *cr) 5352 { 5353 conn_t *connp = (conn_t *)proto_handle; 5354 struct sock_proto_props sopp; 5355 5356 /* All Solaris components should pass a cred for this operation. 
*/ 5357 ASSERT(cr != NULL); 5358 5359 connp->conn_upcalls = sock_upcalls; 5360 connp->conn_upper_handle = sock_handle; 5361 5362 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5363 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5364 sopp.sopp_wroff = connp->conn_wroff; 5365 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5366 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5367 sopp.sopp_maxblk = INFPSZ; 5368 sopp.sopp_maxpsz = IP_MAXPACKET; 5369 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5370 icmp_mod_info.mi_minpsz; 5371 5372 (*connp->conn_upcalls->su_set_proto_props) 5373 (connp->conn_upper_handle, &sopp); 5374 5375 icmp_bind_proto(connp->conn_icmp); 5376 } 5377 5378 /* ARGSUSED3 */ 5379 int 5380 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5381 socklen_t *salenp, cred_t *cr) 5382 { 5383 conn_t *connp = (conn_t *)proto_handle; 5384 icmp_t *icmp = connp->conn_icmp; 5385 int error; 5386 5387 /* All Solaris components should pass a cred for this operation. */ 5388 ASSERT(cr != NULL); 5389 5390 mutex_enter(&connp->conn_lock); 5391 if (icmp->icmp_state != TS_DATA_XFER) 5392 error = ENOTCONN; 5393 else 5394 error = conn_getpeername(connp, sa, salenp); 5395 mutex_exit(&connp->conn_lock); 5396 return (error); 5397 } 5398 5399 /* ARGSUSED3 */ 5400 int 5401 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5402 socklen_t *salenp, cred_t *cr) 5403 { 5404 conn_t *connp = (conn_t *)proto_handle; 5405 int error; 5406 5407 /* All Solaris components should pass a cred for this operation. 
*/ 5408 ASSERT(cr != NULL); 5409 5410 mutex_enter(&connp->conn_lock); 5411 error = conn_getsockname(connp, sa, salenp); 5412 mutex_exit(&connp->conn_lock); 5413 return (error); 5414 } 5415 5416 int 5417 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5418 const void *optvalp, socklen_t optlen, cred_t *cr) 5419 { 5420 conn_t *connp = (conn_t *)proto_handle; 5421 int error; 5422 5423 /* All Solaris components should pass a cred for this operation. */ 5424 ASSERT(cr != NULL); 5425 5426 error = proto_opt_check(level, option_name, optlen, NULL, 5427 icmp_opt_obj.odb_opt_des_arr, 5428 icmp_opt_obj.odb_opt_arr_cnt, 5429 B_TRUE, B_FALSE, cr); 5430 5431 if (error != 0) { 5432 /* 5433 * option not recognized 5434 */ 5435 if (error < 0) { 5436 error = proto_tlitosyserr(-error); 5437 } 5438 return (error); 5439 } 5440 5441 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5442 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5443 (uchar_t *)optvalp, NULL, cr); 5444 5445 ASSERT(error >= 0); 5446 5447 return (error); 5448 } 5449 5450 int 5451 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5452 void *optvalp, socklen_t *optlen, cred_t *cr) 5453 { 5454 int error; 5455 conn_t *connp = (conn_t *)proto_handle; 5456 t_uscalar_t max_optbuf_len; 5457 void *optvalp_buf; 5458 int len; 5459 5460 /* All Solaris components should pass a cred for this operation. 
*/ 5461 ASSERT(cr != NULL); 5462 5463 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5464 icmp_opt_obj.odb_opt_des_arr, 5465 icmp_opt_obj.odb_opt_arr_cnt, 5466 B_FALSE, B_TRUE, cr); 5467 5468 if (error != 0) { 5469 if (error < 0) { 5470 error = proto_tlitosyserr(-error); 5471 } 5472 return (error); 5473 } 5474 5475 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5476 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5477 if (len == -1) { 5478 kmem_free(optvalp_buf, max_optbuf_len); 5479 return (EINVAL); 5480 } 5481 5482 /* 5483 * update optlen and copy option value 5484 */ 5485 t_uscalar_t size = MIN(len, *optlen); 5486 5487 bcopy(optvalp_buf, optvalp, size); 5488 bcopy(&size, optlen, sizeof (size)); 5489 5490 kmem_free(optvalp_buf, max_optbuf_len); 5491 return (0); 5492 } 5493 5494 /* ARGSUSED1 */ 5495 int 5496 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5497 { 5498 conn_t *connp = (conn_t *)proto_handle; 5499 5500 /* All Solaris components should pass a cred for this operation. */ 5501 ASSERT(cr != NULL); 5502 5503 (void) rawip_do_close(connp); 5504 return (0); 5505 } 5506 5507 /* ARGSUSED2 */ 5508 int 5509 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5510 { 5511 conn_t *connp = (conn_t *)proto_handle; 5512 5513 /* All Solaris components should pass a cred for this operation. 
*/ 5514 ASSERT(cr != NULL); 5515 5516 /* shut down the send side */ 5517 if (how != SHUT_RD) 5518 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5519 SOCK_OPCTL_SHUT_SEND, 0); 5520 /* shut down the recv side */ 5521 if (how != SHUT_WR) 5522 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5523 SOCK_OPCTL_SHUT_RECV, 0); 5524 return (0); 5525 } 5526 5527 void 5528 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5529 { 5530 conn_t *connp = (conn_t *)proto_handle; 5531 icmp_t *icmp = connp->conn_icmp; 5532 5533 mutex_enter(&icmp->icmp_recv_lock); 5534 connp->conn_flow_cntrld = B_FALSE; 5535 mutex_exit(&icmp->icmp_recv_lock); 5536 } 5537 5538 int 5539 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5540 int mode, int32_t *rvalp, cred_t *cr) 5541 { 5542 conn_t *connp = (conn_t *)proto_handle; 5543 int error; 5544 5545 /* All Solaris components should pass a cred for this operation. */ 5546 ASSERT(cr != NULL); 5547 5548 /* 5549 * If we don't have a helper stream then create one. 5550 * ip_create_helper_stream takes care of locking the conn_t, 5551 * so this check for NULL is just a performance optimization. 5552 */ 5553 if (connp->conn_helper_info == NULL) { 5554 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5555 5556 ASSERT(is->is_ldi_ident != NULL); 5557 5558 /* 5559 * Create a helper stream for non-STREAMS socket. 
5560 */ 5561 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5562 if (error != 0) { 5563 ip0dbg(("rawip_ioctl: create of IP helper stream " 5564 "failed %d\n", error)); 5565 return (error); 5566 } 5567 } 5568 5569 switch (cmd) { 5570 case _SIOCSOCKFALLBACK: 5571 case TI_GETPEERNAME: 5572 case TI_GETMYNAME: 5573 #ifdef DEBUG 5574 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5575 " socket", cmd); 5576 #endif 5577 error = EINVAL; 5578 break; 5579 default: 5580 /* 5581 * Pass on to IP using helper stream 5582 */ 5583 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5584 cmd, arg, mode, cr, rvalp); 5585 break; 5586 } 5587 return (error); 5588 } 5589 5590 int 5591 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5592 cred_t *cr) 5593 { 5594 sin6_t *sin6; 5595 sin_t *sin = NULL; 5596 uint_t srcid; 5597 conn_t *connp = (conn_t *)proto_handle; 5598 icmp_t *icmp = connp->conn_icmp; 5599 int error = 0; 5600 icmp_stack_t *is = icmp->icmp_is; 5601 pid_t pid = curproc->p_pid; 5602 ip_xmit_attr_t *ixa; 5603 5604 ASSERT(DB_TYPE(mp) == M_DATA); 5605 5606 /* All Solaris components should pass a cred for this operation. */ 5607 ASSERT(cr != NULL); 5608 5609 /* do an implicit bind if necessary */ 5610 if (icmp->icmp_state == TS_UNBND) { 5611 error = rawip_implicit_bind(connp); 5612 /* 5613 * We could be racing with an actual bind, in which case 5614 * we would see EPROTO. We cross our fingers and try 5615 * to connect. 
5616 */ 5617 if (!(error == 0 || error == EPROTO)) { 5618 freemsg(mp); 5619 return (error); 5620 } 5621 } 5622 5623 /* Protocol 255 contains full IP headers */ 5624 /* Read without holding lock */ 5625 if (icmp->icmp_hdrincl) { 5626 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5627 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5628 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5629 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5630 freemsg(mp); 5631 return (EINVAL); 5632 } 5633 } 5634 error = icmp_output_hdrincl(connp, mp, cr, pid); 5635 if (is->is_sendto_ignerr) 5636 return (0); 5637 else 5638 return (error); 5639 } 5640 5641 /* Connected? */ 5642 if (msg->msg_name == NULL) { 5643 if (icmp->icmp_state != TS_DATA_XFER) { 5644 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5645 return (EDESTADDRREQ); 5646 } 5647 if (msg->msg_controllen != 0) { 5648 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5649 NULL, msg, cr, pid); 5650 } else { 5651 error = icmp_output_connected(connp, mp, cr, pid); 5652 } 5653 if (is->is_sendto_ignerr) 5654 return (0); 5655 else 5656 return (error); 5657 } 5658 if (icmp->icmp_state == TS_DATA_XFER) { 5659 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5660 return (EISCONN); 5661 } 5662 error = proto_verify_ip_addr(connp->conn_family, 5663 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5664 if (error != 0) { 5665 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5666 return (error); 5667 } 5668 switch (connp->conn_family) { 5669 case AF_INET6: 5670 sin6 = (sin6_t *)msg->msg_name; 5671 5672 /* No support for mapped addresses on raw sockets */ 5673 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5674 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5675 return (EADDRNOTAVAIL); 5676 } 5677 srcid = sin6->__sin6_src_id; 5678 5679 /* 5680 * If the local address is a mapped address return 5681 * an error. 
5682 * It would be possible to send an IPv6 packet but the 5683 * response would never make it back to the application 5684 * since it is bound to a mapped address. 5685 */ 5686 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5687 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5688 return (EADDRNOTAVAIL); 5689 } 5690 5691 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5692 sin6->sin6_addr = ipv6_loopback; 5693 5694 /* 5695 * We have to allocate an ip_xmit_attr_t before we grab 5696 * conn_lock and we need to hold conn_lock once we've check 5697 * conn_same_as_last_v6 to handle concurrent send* calls on a 5698 * socket. 5699 */ 5700 if (msg->msg_controllen == 0) { 5701 ixa = conn_get_ixa(connp, B_FALSE); 5702 if (ixa == NULL) { 5703 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5704 return (ENOMEM); 5705 } 5706 } else { 5707 ixa = NULL; 5708 } 5709 mutex_enter(&connp->conn_lock); 5710 if (icmp->icmp_delayed_error != 0) { 5711 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5712 5713 error = icmp->icmp_delayed_error; 5714 icmp->icmp_delayed_error = 0; 5715 5716 /* Compare IP address and family */ 5717 5718 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5719 &sin2->sin6_addr) && 5720 sin6->sin6_family == sin2->sin6_family) { 5721 mutex_exit(&connp->conn_lock); 5722 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5723 if (ixa != NULL) 5724 ixa_refrele(ixa); 5725 return (error); 5726 } 5727 } 5728 if (msg->msg_controllen != 0) { 5729 mutex_exit(&connp->conn_lock); 5730 ASSERT(ixa == NULL); 5731 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5732 NULL, msg, cr, pid); 5733 } else if (conn_same_as_last_v6(connp, sin6) && 5734 connp->conn_lastsrcid == srcid && 5735 ipsec_outbound_policy_current(ixa)) { 5736 /* icmp_output_lastdst drops conn_lock */ 5737 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5738 } else { 5739 /* icmp_output_newdst drops conn_lock */ 5740 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5741 pid, ixa); 5742 } 5743 
ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5744 if (is->is_sendto_ignerr) 5745 return (0); 5746 else 5747 return (error); 5748 case AF_INET: 5749 sin = (sin_t *)msg->msg_name; 5750 5751 if (sin->sin_addr.s_addr == INADDR_ANY) 5752 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5753 5754 /* 5755 * We have to allocate an ip_xmit_attr_t before we grab 5756 * conn_lock and we need to hold conn_lock once we've check 5757 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5758 */ 5759 if (msg->msg_controllen == 0) { 5760 ixa = conn_get_ixa(connp, B_FALSE); 5761 if (ixa == NULL) { 5762 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5763 return (ENOMEM); 5764 } 5765 } else { 5766 ixa = NULL; 5767 } 5768 mutex_enter(&connp->conn_lock); 5769 if (icmp->icmp_delayed_error != 0) { 5770 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5771 5772 error = icmp->icmp_delayed_error; 5773 icmp->icmp_delayed_error = 0; 5774 5775 /* Compare IP address */ 5776 5777 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5778 mutex_exit(&connp->conn_lock); 5779 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5780 if (ixa != NULL) 5781 ixa_refrele(ixa); 5782 return (error); 5783 } 5784 } 5785 5786 if (msg->msg_controllen != 0) { 5787 mutex_exit(&connp->conn_lock); 5788 ASSERT(ixa == NULL); 5789 error = icmp_output_ancillary(connp, sin, NULL, mp, 5790 NULL, msg, cr, pid); 5791 } else if (conn_same_as_last_v4(connp, sin) && 5792 ipsec_outbound_policy_current(ixa)) { 5793 /* icmp_output_lastdst drops conn_lock */ 5794 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5795 } else { 5796 /* icmp_output_newdst drops conn_lock */ 5797 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5798 pid, ixa); 5799 } 5800 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5801 if (is->is_sendto_ignerr) 5802 return (0); 5803 else 5804 return (error); 5805 default: 5806 return (EINVAL); 5807 } 5808 } 5809 5810 sock_downcalls_t sock_rawip_downcalls = { 5811 rawip_activate, 5812 rawip_accept, 5813 
rawip_bind, 5814 rawip_listen, 5815 rawip_connect, 5816 rawip_getpeername, 5817 rawip_getsockname, 5818 rawip_getsockopt, 5819 rawip_setsockopt, 5820 rawip_send, 5821 NULL, 5822 NULL, 5823 NULL, 5824 rawip_shutdown, 5825 rawip_clr_flowctrl, 5826 rawip_ioctl, 5827 rawip_close 5828 }; 5829