1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* Copyright (c) 1990 Mentat Inc. */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #include <sys/stropts.h> 29 #include <sys/strlog.h> 30 #include <sys/strsun.h> 31 #define _SUN_TPI_VERSION 2 32 #include <sys/tihdr.h> 33 #include <sys/timod.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/strsubr.h> 37 #include <sys/suntpi.h> 38 #include <sys/xti_inet.h> 39 #include <sys/cmn_err.h> 40 #include <sys/kmem.h> 41 #include <sys/cred.h> 42 #include <sys/policy.h> 43 #include <sys/priv.h> 44 #include <sys/ucred.h> 45 #include <sys/zone.h> 46 47 #include <sys/sockio.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/vtrace.h> 51 #include <sys/sdt.h> 52 #include <sys/debug.h> 53 #include <sys/isa_defs.h> 54 #include <sys/random.h> 55 #include <netinet/in.h> 56 #include <netinet/ip6.h> 57 #include <netinet/icmp6.h> 58 #include <netinet/udp.h> 59 60 #include <inet/common.h> 61 #include <inet/ip.h> 62 #include <inet/ip_impl.h> 63 #include <inet/ipsec_impl.h> 64 #include <inet/ip6.h> 65 
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
#include <inet/ip_multi.h>
#include <inet/ip_ndp.h>
#include <inet/proto_set.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/ipclassifier.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h>

#include <sys/disp.h>

/*
 * Synchronization notes:
 *
 * RAWIP is MT and uses the usual kernel synchronization primitives. We use
 * conn_lock to protect the icmp_t.
 *
 * Plumbing notes:
 * ICMP is always a device driver. For compatibility with mibopen() code
 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
 * dummy module.
 */

/* Forward declarations: TPI/STREAMS entry points and their helpers */
static void	icmp_addr_req(queue_t *q, mblk_t *mp);
static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void	icmp_bind_proto(icmp_t *icmp);
static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void	icmp_capability_req(queue_t *q, mblk_t *mp);
static int	icmp_close(queue_t *q, int flags);
static void	icmp_close_free(conn_t *);
static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void	icmp_info_req(queue_t *q, mblk_t *mp);
static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t	*icmp_open(int family, cred_t *credp, int *err, int flags);
static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
int		icmp_opt_set(conn_t *connp, uint_t optset_context,
    int level, int name, uint_t inlen,
    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr);
int		icmp_opt_get(conn_t *connp, int level, int name,
    uchar_t *ptr);
static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr, int len);
static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static void	icmp_wput(queue_t *q, mblk_t *mp);
static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void	icmp_wput_other(queue_t *q, mblk_t *mp);
static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

/* Per-netstack setup and teardown */
static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rawip_stack_fini(netstackid_t stackid, void *arg);

/* kstat support */
static void	*rawip_kstat_init(netstackid_t stackid);
static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int	rawip_kstat_update(kstat_t *kp, int rw);
static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t	*rawip_do_open(int, cred_t *, int *, int);
static void	rawip_do_close(conn_t *);
static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int	rawip_do_unbind(conn_t *);
static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

int	rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);
int	rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);

/* Module information shared by every qinit structure below */
static struct module_info icmp_mod_info = {
	5707, "icmp", 1, INFPSZ, 512, 128
};

/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/* ICMP entry point during fallback */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/* For AF_INET aka /dev/icmp */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/* For AF_INET6 aka /dev/icmp6 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};

/* Default structure copied into T_INFO_ACK messages */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	/* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size.  icmp does not support connect data. */
	T_INVALID,	/* DDATA_size.  icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO)	/* PROVIDER_flag */
};

/*
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Note: All those tunables which do not start with "icmp_" are Committed and
 * therefore are public. See PSARC 2009/306.
 *
 * The position of each entry matters: the is_* accessor macros that follow
 * the table index it by number.
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
	/* tunable - 0 */
	{ "icmp_wroff_extra", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 128, 32}, {32} },

	/* 1 */
	{ "icmp_ipv4_ttl", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 255, 255}, {255} },

	/* 2 */
	{ "icmp_ipv6_hoplimit", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
	    {IPV6_DEFAULT_HOPS} },

	/* 3 */
	{ "icmp_bsd_compat", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	/* 4 */
	{ "send_maxbuf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {4096, 65536, 8192}, {8192} },

	/* 5 */
	{ "icmp_xmit_lowat", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 65536, 1024}, {1024} },

	/* 6 */
	{ "recv_maxbuf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {4096, 65536, 8192}, {8192} },

	/* 7 */
	{ "icmp_max_buf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {65536, 1024*1024*1024, 256*1024}, {256 * 1024} },

	/* 8 */
	{ "icmp_pmtu_discovery", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/* 9 */
	{ "icmp_sendto_ignerr", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

	{ NULL, 0, NULL, NULL, {0}, {0} }
};

#define	is_wroff_extra		is_propinfo_tbl[0].prop_cur_uval
#define	is_ipv4_ttl		is_propinfo_tbl[1].prop_cur_uval
#define	is_ipv6_hoplimit	is_propinfo_tbl[2].prop_cur_uval
#define	is_bsd_compat		\
is_propinfo_tbl[3].prop_cur_bval 274 #define is_xmit_hiwat is_propinfo_tbl[4].prop_cur_uval 275 #define is_xmit_lowat is_propinfo_tbl[5].prop_cur_uval 276 #define is_recv_hiwat is_propinfo_tbl[6].prop_cur_uval 277 #define is_max_buf is_propinfo_tbl[7].prop_cur_uval 278 #define is_pmtu_discovery is_propinfo_tbl[8].prop_cur_bval 279 #define is_sendto_ignerr is_propinfo_tbl[9].prop_cur_bval 280 281 typedef union T_primitives *t_primp_t; 282 283 /* 284 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 285 * passed to icmp_wput. 286 * It calls IP to verify the local IP address, and calls IP to insert 287 * the conn_t in the fanout table. 288 * If everything is ok it then sends the T_BIND_ACK back up. 289 */ 290 static void 291 icmp_tpi_bind(queue_t *q, mblk_t *mp) 292 { 293 int error; 294 struct sockaddr *sa; 295 struct T_bind_req *tbr; 296 socklen_t len; 297 sin_t *sin; 298 sin6_t *sin6; 299 icmp_t *icmp; 300 conn_t *connp = Q_TO_CONN(q); 301 mblk_t *mp1; 302 cred_t *cr; 303 304 /* 305 * All Solaris components should pass a db_credp 306 * for this TPI message, hence we ASSERT. 307 * But in case there is some other M_PROTO that looks 308 * like a TPI message sent by some other kernel 309 * component, we check and return an error. 310 */ 311 cr = msg_getcred(mp, NULL); 312 ASSERT(cr != NULL); 313 if (cr == NULL) { 314 icmp_err_ack(q, mp, TSYSERR, EINVAL); 315 return; 316 } 317 318 icmp = connp->conn_icmp; 319 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 320 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 321 "icmp_bind: bad req, len %u", 322 (uint_t)(mp->b_wptr - mp->b_rptr)); 323 icmp_err_ack(q, mp, TPROTO, 0); 324 return; 325 } 326 327 if (icmp->icmp_state != TS_UNBND) { 328 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 329 "icmp_bind: bad state, %u", icmp->icmp_state); 330 icmp_err_ack(q, mp, TOUTSTATE, 0); 331 return; 332 } 333 334 /* 335 * Reallocate the message to make sure we have enough room for an 336 * address. 
337 */ 338 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 339 if (mp1 == NULL) { 340 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 341 return; 342 } 343 mp = mp1; 344 345 /* Reset the message type in preparation for shipping it back. */ 346 DB_TYPE(mp) = M_PCPROTO; 347 tbr = (struct T_bind_req *)mp->b_rptr; 348 len = tbr->ADDR_length; 349 switch (len) { 350 case 0: /* request for a generic port */ 351 tbr->ADDR_offset = sizeof (struct T_bind_req); 352 if (connp->conn_family == AF_INET) { 353 tbr->ADDR_length = sizeof (sin_t); 354 sin = (sin_t *)&tbr[1]; 355 *sin = sin_null; 356 sin->sin_family = AF_INET; 357 mp->b_wptr = (uchar_t *)&sin[1]; 358 sa = (struct sockaddr *)sin; 359 len = sizeof (sin_t); 360 } else { 361 ASSERT(connp->conn_family == AF_INET6); 362 tbr->ADDR_length = sizeof (sin6_t); 363 sin6 = (sin6_t *)&tbr[1]; 364 *sin6 = sin6_null; 365 sin6->sin6_family = AF_INET6; 366 mp->b_wptr = (uchar_t *)&sin6[1]; 367 sa = (struct sockaddr *)sin6; 368 len = sizeof (sin6_t); 369 } 370 break; 371 372 case sizeof (sin_t): /* Complete IPv4 address */ 373 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 374 sizeof (sin_t)); 375 break; 376 377 case sizeof (sin6_t): /* Complete IPv6 address */ 378 sa = (struct sockaddr *)mi_offset_param(mp, 379 tbr->ADDR_offset, sizeof (sin6_t)); 380 break; 381 382 default: 383 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 384 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 385 icmp_err_ack(q, mp, TBADADDR, 0); 386 return; 387 } 388 389 error = rawip_do_bind(connp, sa, len); 390 if (error != 0) { 391 if (error > 0) { 392 icmp_err_ack(q, mp, TSYSERR, error); 393 } else { 394 icmp_err_ack(q, mp, -error, 0); 395 } 396 } else { 397 tbr->PRIM_type = T_BIND_ACK; 398 qreply(q, mp); 399 } 400 } 401 402 static int 403 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 404 { 405 sin_t *sin; 406 sin6_t *sin6; 407 icmp_t *icmp = connp->conn_icmp; 408 int error = 0; 409 ip_laddr_t laddr_type = 
IPVL_UNICAST_UP; /* INADDR_ANY */ 410 in_port_t lport; /* Network byte order */ 411 ipaddr_t v4src; /* Set if AF_INET */ 412 in6_addr_t v6src; 413 uint_t scopeid = 0; 414 zoneid_t zoneid = IPCL_ZONEID(connp); 415 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 416 417 if (sa == NULL || !OK_32PTR((char *)sa)) { 418 return (EINVAL); 419 } 420 421 switch (len) { 422 case sizeof (sin_t): /* Complete IPv4 address */ 423 sin = (sin_t *)sa; 424 if (sin->sin_family != AF_INET || 425 connp->conn_family != AF_INET) { 426 /* TSYSERR, EAFNOSUPPORT */ 427 return (EAFNOSUPPORT); 428 } 429 v4src = sin->sin_addr.s_addr; 430 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 431 if (v4src != INADDR_ANY) { 432 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 433 B_TRUE); 434 } 435 lport = sin->sin_port; 436 break; 437 case sizeof (sin6_t): /* Complete IPv6 address */ 438 sin6 = (sin6_t *)sa; 439 if (sin6->sin6_family != AF_INET6 || 440 connp->conn_family != AF_INET6) { 441 /* TSYSERR, EAFNOSUPPORT */ 442 return (EAFNOSUPPORT); 443 } 444 /* No support for mapped addresses on raw sockets */ 445 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 446 /* TSYSERR, EADDRNOTAVAIL */ 447 return (EADDRNOTAVAIL); 448 } 449 v6src = sin6->sin6_addr; 450 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 451 if (IN6_IS_ADDR_LINKSCOPE(&v6src)) 452 scopeid = sin6->sin6_scope_id; 453 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 454 B_TRUE, scopeid); 455 } 456 lport = sin6->sin6_port; 457 break; 458 459 default: 460 /* TBADADDR */ 461 return (EADDRNOTAVAIL); 462 } 463 464 /* Is the local address a valid unicast, multicast, or broadcast? */ 465 if (laddr_type == IPVL_BAD) 466 return (EADDRNOTAVAIL); 467 468 /* 469 * The state must be TS_UNBND. 470 */ 471 mutex_enter(&connp->conn_lock); 472 if (icmp->icmp_state != TS_UNBND) { 473 mutex_exit(&connp->conn_lock); 474 return (-TOUTSTATE); 475 } 476 477 /* 478 * Copy the source address into our icmp structure. 
This address 479 * may still be zero; if so, ip will fill in the correct address 480 * each time an outbound packet is passed to it. 481 * If we are binding to a broadcast or multicast address then 482 * we just set the conn_bound_addr since we don't want to use 483 * that as the source address when sending. 484 */ 485 connp->conn_bound_addr_v6 = v6src; 486 connp->conn_laddr_v6 = v6src; 487 if (scopeid != 0) { 488 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 489 connp->conn_ixa->ixa_scopeid = scopeid; 490 connp->conn_incoming_ifindex = scopeid; 491 } else { 492 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 493 connp->conn_incoming_ifindex = connp->conn_bound_if; 494 } 495 496 switch (laddr_type) { 497 case IPVL_UNICAST_UP: 498 case IPVL_UNICAST_DOWN: 499 connp->conn_saddr_v6 = v6src; 500 connp->conn_mcbc_bind = B_FALSE; 501 break; 502 case IPVL_MCAST: 503 case IPVL_BCAST: 504 /* ip_set_destination will pick a source address later */ 505 connp->conn_saddr_v6 = ipv6_all_zeros; 506 connp->conn_mcbc_bind = B_TRUE; 507 break; 508 } 509 510 /* Any errors after this point should use late_error */ 511 512 /* 513 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 514 * with IPPROTO_TCP. 515 */ 516 connp->conn_lport = lport; 517 connp->conn_fport = 0; 518 519 if (connp->conn_family == AF_INET) { 520 ASSERT(connp->conn_ipversion == IPV4_VERSION); 521 } else { 522 ASSERT(connp->conn_ipversion == IPV6_VERSION); 523 } 524 525 icmp->icmp_state = TS_IDLE; 526 527 /* 528 * We create an initial header template here to make a subsequent 529 * sendto have a starting point. Since conn_last_dst is zero the 530 * first sendto will always follow the 'dst changed' code path. 531 * Note that we defer massaging options and the related checksum 532 * adjustment until we have a destination address. 
533 */ 534 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 535 &connp->conn_faddr_v6, connp->conn_flowinfo); 536 if (error != 0) { 537 mutex_exit(&connp->conn_lock); 538 goto late_error; 539 } 540 /* Just in case */ 541 connp->conn_faddr_v6 = ipv6_all_zeros; 542 connp->conn_v6lastdst = ipv6_all_zeros; 543 mutex_exit(&connp->conn_lock); 544 545 error = ip_laddr_fanout_insert(connp); 546 if (error != 0) 547 goto late_error; 548 549 /* Bind succeeded */ 550 return (0); 551 552 late_error: 553 mutex_enter(&connp->conn_lock); 554 connp->conn_saddr_v6 = ipv6_all_zeros; 555 connp->conn_bound_addr_v6 = ipv6_all_zeros; 556 connp->conn_laddr_v6 = ipv6_all_zeros; 557 if (scopeid != 0) { 558 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 559 connp->conn_incoming_ifindex = connp->conn_bound_if; 560 } 561 icmp->icmp_state = TS_UNBND; 562 connp->conn_v6lastdst = ipv6_all_zeros; 563 connp->conn_lport = 0; 564 565 /* Restore the header that was built above - different source address */ 566 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 567 &connp->conn_faddr_v6, connp->conn_flowinfo); 568 mutex_exit(&connp->conn_lock); 569 return (error); 570 } 571 572 /* 573 * Tell IP to just bind to the protocol. 574 */ 575 static void 576 icmp_bind_proto(icmp_t *icmp) 577 { 578 conn_t *connp = icmp->icmp_connp; 579 580 mutex_enter(&connp->conn_lock); 581 connp->conn_saddr_v6 = ipv6_all_zeros; 582 connp->conn_laddr_v6 = ipv6_all_zeros; 583 connp->conn_faddr_v6 = ipv6_all_zeros; 584 connp->conn_v6lastdst = ipv6_all_zeros; 585 mutex_exit(&connp->conn_lock); 586 587 (void) ip_laddr_fanout_insert(connp); 588 } 589 590 /* 591 * This routine handles each T_CONN_REQ message passed to icmp. It 592 * associates a default destination address with the stream. 593 * 594 * After various error checks are completed, icmp_connect() lays 595 * the target address and port into the composite header template. 
596 * Then we ask IP for information, including a source address if we didn't 597 * already have one. Finally we send up the T_OK_ACK reply message. 598 */ 599 static void 600 icmp_tpi_connect(queue_t *q, mblk_t *mp) 601 { 602 conn_t *connp = Q_TO_CONN(q); 603 struct T_conn_req *tcr; 604 struct sockaddr *sa; 605 socklen_t len; 606 int error; 607 cred_t *cr; 608 pid_t pid; 609 /* 610 * All Solaris components should pass a db_credp 611 * for this TPI message, hence we ASSERT. 612 * But in case there is some other M_PROTO that looks 613 * like a TPI message sent by some other kernel 614 * component, we check and return an error. 615 */ 616 cr = msg_getcred(mp, &pid); 617 ASSERT(cr != NULL); 618 if (cr == NULL) { 619 icmp_err_ack(q, mp, TSYSERR, EINVAL); 620 return; 621 } 622 623 tcr = (struct T_conn_req *)mp->b_rptr; 624 /* Sanity checks */ 625 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 626 icmp_err_ack(q, mp, TPROTO, 0); 627 return; 628 } 629 630 if (tcr->OPT_length != 0) { 631 icmp_err_ack(q, mp, TBADOPT, 0); 632 return; 633 } 634 635 len = tcr->DEST_length; 636 637 switch (len) { 638 default: 639 icmp_err_ack(q, mp, TBADADDR, 0); 640 return; 641 case sizeof (sin_t): 642 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 643 sizeof (sin_t)); 644 break; 645 case sizeof (sin6_t): 646 sa = (struct sockaddr *)mi_offset_param(mp, 647 tcr->DEST_offset, sizeof (sin6_t)); 648 break; 649 } 650 651 error = proto_verify_ip_addr(connp->conn_family, sa, len); 652 if (error != 0) { 653 icmp_err_ack(q, mp, TSYSERR, error); 654 return; 655 } 656 657 error = rawip_do_connect(connp, sa, len, cr, pid); 658 if (error != 0) { 659 if (error < 0) { 660 icmp_err_ack(q, mp, -error, 0); 661 } else { 662 icmp_err_ack(q, mp, 0, error); 663 } 664 } else { 665 mblk_t *mp1; 666 667 /* 668 * We have to send a connection confirmation to 669 * keep TLI happy. 
670 */ 671 if (connp->conn_family == AF_INET) { 672 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 673 sizeof (sin_t), NULL, 0); 674 } else { 675 ASSERT(connp->conn_family == AF_INET6); 676 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 677 sizeof (sin6_t), NULL, 0); 678 } 679 if (mp1 == NULL) { 680 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 681 return; 682 } 683 684 /* 685 * Send ok_ack for T_CONN_REQ 686 */ 687 mp = mi_tpi_ok_ack_alloc(mp); 688 if (mp == NULL) { 689 /* Unable to reuse the T_CONN_REQ for the ack. */ 690 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 691 return; 692 } 693 putnext(connp->conn_rq, mp); 694 putnext(connp->conn_rq, mp1); 695 } 696 } 697 698 static int 699 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 700 cred_t *cr, pid_t pid) 701 { 702 icmp_t *icmp; 703 sin_t *sin; 704 sin6_t *sin6; 705 int error; 706 uint16_t dstport; 707 ipaddr_t v4dst; 708 in6_addr_t v6dst; 709 uint32_t flowinfo; 710 ip_xmit_attr_t *ixa; 711 ip_xmit_attr_t *oldixa; 712 uint_t scopeid = 0; 713 uint_t srcid = 0; 714 in6_addr_t v6src = connp->conn_saddr_v6; 715 716 icmp = connp->conn_icmp; 717 718 if (sa == NULL || !OK_32PTR((char *)sa)) { 719 return (EINVAL); 720 } 721 722 ASSERT(sa != NULL && len != 0); 723 724 /* 725 * Determine packet type based on type of address passed in 726 * the request should contain an IPv4 or IPv6 address. 727 * Make sure that address family matches the type of 728 * family of the address passed down. 
729 */ 730 switch (len) { 731 case sizeof (sin_t): 732 sin = (sin_t *)sa; 733 734 v4dst = sin->sin_addr.s_addr; 735 dstport = sin->sin_port; 736 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 737 ASSERT(connp->conn_ipversion == IPV4_VERSION); 738 break; 739 740 case sizeof (sin6_t): 741 sin6 = (sin6_t *)sa; 742 743 /* No support for mapped addresses on raw sockets */ 744 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 745 return (EADDRNOTAVAIL); 746 } 747 v6dst = sin6->sin6_addr; 748 dstport = sin6->sin6_port; 749 ASSERT(connp->conn_ipversion == IPV6_VERSION); 750 flowinfo = sin6->sin6_flowinfo; 751 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 752 scopeid = sin6->sin6_scope_id; 753 srcid = sin6->__sin6_src_id; 754 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 755 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 756 connp->conn_netstack); 757 } 758 break; 759 } 760 761 /* 762 * If there is a different thread using conn_ixa then we get a new 763 * copy and cut the old one loose from conn_ixa. Otherwise we use 764 * conn_ixa and prevent any other thread from using/changing it. 765 * Once connect() is done other threads can use conn_ixa since the 766 * refcnt will be back at one. 767 * We defer updating conn_ixa until later to handle any concurrent 768 * conn_ixa_cleanup thread. 769 */ 770 ixa = conn_get_ixa(connp, B_FALSE); 771 if (ixa == NULL) 772 return (ENOMEM); 773 774 ASSERT(ixa->ixa_refcnt >= 2); 775 ASSERT(ixa == connp->conn_ixa); 776 777 mutex_enter(&connp->conn_lock); 778 /* 779 * This icmp_t must have bound already before doing a connect. 780 * Reject if a connect is in progress (we drop conn_lock during 781 * rawip_do_connect). 
782 */ 783 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 784 mutex_exit(&connp->conn_lock); 785 ixa_refrele(ixa); 786 return (-TOUTSTATE); 787 } 788 789 if (icmp->icmp_state == TS_DATA_XFER) { 790 /* Already connected - clear out state */ 791 if (connp->conn_mcbc_bind) 792 connp->conn_saddr_v6 = ipv6_all_zeros; 793 else 794 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 795 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 796 connp->conn_faddr_v6 = ipv6_all_zeros; 797 icmp->icmp_state = TS_IDLE; 798 } 799 800 /* 801 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 802 * with IPPROTO_TCP. 803 */ 804 connp->conn_fport = dstport; 805 if (connp->conn_ipversion == IPV4_VERSION) { 806 /* 807 * Interpret a zero destination to mean loopback. 808 * Update the T_CONN_REQ (sin/sin6) since it is used to 809 * generate the T_CONN_CON. 810 */ 811 if (v4dst == INADDR_ANY) { 812 v4dst = htonl(INADDR_LOOPBACK); 813 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 814 ASSERT(connp->conn_family == AF_INET); 815 sin->sin_addr.s_addr = v4dst; 816 } 817 connp->conn_faddr_v6 = v6dst; 818 connp->conn_flowinfo = 0; 819 } else { 820 ASSERT(connp->conn_ipversion == IPV6_VERSION); 821 /* 822 * Interpret a zero destination to mean loopback. 823 * Update the T_CONN_REQ (sin/sin6) since it is used to 824 * generate the T_CONN_CON. 
825 */ 826 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 827 v6dst = ipv6_loopback; 828 sin6->sin6_addr = v6dst; 829 } 830 connp->conn_faddr_v6 = v6dst; 831 connp->conn_flowinfo = flowinfo; 832 } 833 834 /* 835 * We update our cred/cpid based on the caller of connect 836 */ 837 if (connp->conn_cred != cr) { 838 crhold(cr); 839 crfree(connp->conn_cred); 840 connp->conn_cred = cr; 841 } 842 connp->conn_cpid = pid; 843 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 844 ixa->ixa_cred = cr; 845 ixa->ixa_cpid = pid; 846 if (is_system_labeled()) { 847 /* We need to restart with a label based on the cred */ 848 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 849 } 850 851 if (scopeid != 0) { 852 ixa->ixa_flags |= IXAF_SCOPEID_SET; 853 ixa->ixa_scopeid = scopeid; 854 connp->conn_incoming_ifindex = scopeid; 855 } else { 856 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 857 connp->conn_incoming_ifindex = connp->conn_bound_if; 858 } 859 860 /* 861 * conn_connect will drop conn_lock and reacquire it. 862 * To prevent a send* from messing with this icmp_t while the lock 863 * is dropped we set icmp_state and clear conn_v6lastdst. 864 * That will make all send* fail with EISCONN. 865 */ 866 connp->conn_v6lastdst = ipv6_all_zeros; 867 icmp->icmp_state = TS_WCON_CREQ; 868 869 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 870 mutex_exit(&connp->conn_lock); 871 if (error != 0) 872 goto connect_failed; 873 874 /* 875 * The addresses have been verified. Time to insert in 876 * the correct fanout list. 
877 */ 878 error = ipcl_conn_insert(connp); 879 if (error != 0) 880 goto connect_failed; 881 882 mutex_enter(&connp->conn_lock); 883 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 884 &connp->conn_faddr_v6, connp->conn_flowinfo); 885 if (error != 0) { 886 mutex_exit(&connp->conn_lock); 887 goto connect_failed; 888 } 889 890 icmp->icmp_state = TS_DATA_XFER; 891 /* Record this as the "last" send even though we haven't sent any */ 892 connp->conn_v6lastdst = connp->conn_faddr_v6; 893 connp->conn_lastipversion = connp->conn_ipversion; 894 connp->conn_lastdstport = connp->conn_fport; 895 connp->conn_lastflowinfo = connp->conn_flowinfo; 896 connp->conn_lastscopeid = scopeid; 897 connp->conn_lastsrcid = srcid; 898 /* Also remember a source to use together with lastdst */ 899 connp->conn_v6lastsrc = v6src; 900 901 oldixa = conn_replace_ixa(connp, ixa); 902 mutex_exit(&connp->conn_lock); 903 ixa_refrele(oldixa); 904 905 ixa_refrele(ixa); 906 return (0); 907 908 connect_failed: 909 if (ixa != NULL) 910 ixa_refrele(ixa); 911 mutex_enter(&connp->conn_lock); 912 icmp->icmp_state = TS_IDLE; 913 /* In case the source address was set above */ 914 if (connp->conn_mcbc_bind) 915 connp->conn_saddr_v6 = ipv6_all_zeros; 916 else 917 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 918 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 919 connp->conn_faddr_v6 = ipv6_all_zeros; 920 connp->conn_v6lastdst = ipv6_all_zeros; 921 connp->conn_flowinfo = 0; 922 923 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 924 &connp->conn_faddr_v6, connp->conn_flowinfo); 925 mutex_exit(&connp->conn_lock); 926 return (error); 927 } 928 929 static void 930 rawip_do_close(conn_t *connp) 931 { 932 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 933 934 ip_quiesce_conn(connp); 935 936 if (!IPCL_IS_NONSTR(connp)) { 937 qprocsoff(connp->conn_rq); 938 } 939 940 icmp_close_free(connp); 941 942 /* 943 * Now we are truly single threaded on this stream, and can 944 * delete the things 
hanging off the connp, and finally the connp. 945 * We removed this connp from the fanout list, it cannot be 946 * accessed thru the fanouts, and we already waited for the 947 * conn_ref to drop to 0. We are already in close, so 948 * there cannot be any other thread from the top. qprocsoff 949 * has completed, and service has completed or won't run in 950 * future. 951 */ 952 ASSERT(connp->conn_ref == 1); 953 954 if (!IPCL_IS_NONSTR(connp)) { 955 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 956 } else { 957 ip_free_helper_stream(connp); 958 } 959 960 connp->conn_ref--; 961 ipcl_conn_destroy(connp); 962 } 963 964 static int 965 icmp_close(queue_t *q, int flags) 966 { 967 conn_t *connp; 968 969 if (flags & SO_FALLBACK) { 970 /* 971 * stream is being closed while in fallback 972 * simply free the resources that were allocated 973 */ 974 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 975 qprocsoff(q); 976 goto done; 977 } 978 979 connp = Q_TO_CONN(q); 980 (void) rawip_do_close(connp); 981 done: 982 q->q_ptr = WR(q)->q_ptr = NULL; 983 return (0); 984 } 985 986 static void 987 icmp_close_free(conn_t *connp) 988 { 989 icmp_t *icmp = connp->conn_icmp; 990 991 if (icmp->icmp_filter != NULL) { 992 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 993 icmp->icmp_filter = NULL; 994 } 995 996 /* 997 * Clear any fields which the kmem_cache constructor clears. 998 * Only icmp_connp needs to be preserved. 999 * TBD: We should make this more efficient to avoid clearing 1000 * everything. 1001 */ 1002 ASSERT(icmp->icmp_connp == connp); 1003 bzero(icmp, sizeof (icmp_t)); 1004 icmp->icmp_connp = connp; 1005 } 1006 1007 /* 1008 * This routine handles each T_DISCON_REQ message passed to icmp 1009 * as an indicating that ICMP is no longer connected. This results 1010 * in telling IP to restore the binding to just the local address. 
1011 */ 1012 static int 1013 icmp_do_disconnect(conn_t *connp) 1014 { 1015 icmp_t *icmp = connp->conn_icmp; 1016 int error; 1017 1018 mutex_enter(&connp->conn_lock); 1019 if (icmp->icmp_state != TS_DATA_XFER) { 1020 mutex_exit(&connp->conn_lock); 1021 return (-TOUTSTATE); 1022 } 1023 if (connp->conn_mcbc_bind) 1024 connp->conn_saddr_v6 = ipv6_all_zeros; 1025 else 1026 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1027 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1028 connp->conn_faddr_v6 = ipv6_all_zeros; 1029 icmp->icmp_state = TS_IDLE; 1030 1031 connp->conn_v6lastdst = ipv6_all_zeros; 1032 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1033 &connp->conn_faddr_v6, connp->conn_flowinfo); 1034 mutex_exit(&connp->conn_lock); 1035 if (error != 0) 1036 return (error); 1037 1038 /* 1039 * Tell IP to remove the full binding and revert 1040 * to the local address binding. 1041 */ 1042 return (ip_laddr_fanout_insert(connp)); 1043 } 1044 1045 static void 1046 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1047 { 1048 conn_t *connp = Q_TO_CONN(q); 1049 int error; 1050 1051 /* 1052 * Allocate the largest primitive we need to send back 1053 * T_error_ack is > than T_ok_ack 1054 */ 1055 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1056 if (mp == NULL) { 1057 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1058 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1059 return; 1060 } 1061 1062 error = icmp_do_disconnect(connp); 1063 1064 if (error != 0) { 1065 if (error > 0) { 1066 icmp_err_ack(q, mp, 0, error); 1067 } else { 1068 icmp_err_ack(q, mp, -error, 0); 1069 } 1070 } else { 1071 mp = mi_tpi_ok_ack_alloc(mp); 1072 ASSERT(mp != NULL); 1073 qreply(q, mp); 1074 } 1075 } 1076 1077 static int 1078 icmp_disconnect(conn_t *connp) 1079 { 1080 int error; 1081 1082 connp->conn_dgram_errind = B_FALSE; 1083 1084 error = icmp_do_disconnect(connp); 1085 1086 if (error < 0) 1087 error = proto_tlitosyserr(-error); 1088 return (error); 1089 } 1090 1091 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1092 static void 1093 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1094 { 1095 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1096 qreply(q, mp); 1097 } 1098 1099 /* Shorthand to generate and send TPI error acks to our client */ 1100 static void 1101 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1102 t_scalar_t t_error, int sys_error) 1103 { 1104 struct T_error_ack *teackp; 1105 1106 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1107 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1108 teackp = (struct T_error_ack *)mp->b_rptr; 1109 teackp->ERROR_prim = primitive; 1110 teackp->TLI_error = t_error; 1111 teackp->UNIX_error = sys_error; 1112 qreply(q, mp); 1113 } 1114 } 1115 1116 /* 1117 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1118 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1119 * Assumes that IP has pulled up everything up to and including the ICMP header. 
1120 */ 1121 /* ARGSUSED2 */ 1122 static void 1123 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1124 { 1125 conn_t *connp = (conn_t *)arg1; 1126 icmp_t *icmp = connp->conn_icmp; 1127 icmph_t *icmph; 1128 ipha_t *ipha; 1129 int iph_hdr_length; 1130 sin_t sin; 1131 mblk_t *mp1; 1132 int error = 0; 1133 1134 ipha = (ipha_t *)mp->b_rptr; 1135 1136 ASSERT(OK_32PTR(mp->b_rptr)); 1137 1138 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1139 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1140 icmp_icmp_error_ipv6(connp, mp, ira); 1141 return; 1142 } 1143 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1144 1145 /* Skip past the outer IP and ICMP headers */ 1146 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1147 iph_hdr_length = ira->ira_ip_hdr_length; 1148 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1149 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1150 1151 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1152 1153 switch (icmph->icmph_type) { 1154 case ICMP_DEST_UNREACHABLE: 1155 switch (icmph->icmph_code) { 1156 case ICMP_FRAGMENTATION_NEEDED: { 1157 ipha_t *ipha; 1158 ip_xmit_attr_t *ixa; 1159 /* 1160 * IP has already adjusted the path MTU. 1161 * But we need to adjust DF for IPv4. 1162 */ 1163 if (connp->conn_ipversion != IPV4_VERSION) 1164 break; 1165 1166 ixa = conn_get_ixa(connp, B_FALSE); 1167 if (ixa == NULL || ixa->ixa_ire == NULL) { 1168 /* 1169 * Some other thread holds conn_ixa. We will 1170 * redo this on the next ICMP too big. 
1171 */ 1172 if (ixa != NULL) 1173 ixa_refrele(ixa); 1174 break; 1175 } 1176 (void) ip_get_pmtu(ixa); 1177 1178 mutex_enter(&connp->conn_lock); 1179 ipha = (ipha_t *)connp->conn_ht_iphc; 1180 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1181 ipha->ipha_fragment_offset_and_flags |= 1182 IPH_DF_HTONS; 1183 } else { 1184 ipha->ipha_fragment_offset_and_flags &= 1185 ~IPH_DF_HTONS; 1186 } 1187 mutex_exit(&connp->conn_lock); 1188 ixa_refrele(ixa); 1189 break; 1190 } 1191 case ICMP_PORT_UNREACHABLE: 1192 case ICMP_PROTOCOL_UNREACHABLE: 1193 error = ECONNREFUSED; 1194 break; 1195 default: 1196 /* Transient errors */ 1197 break; 1198 } 1199 break; 1200 default: 1201 /* Transient errors */ 1202 break; 1203 } 1204 if (error == 0) { 1205 freemsg(mp); 1206 return; 1207 } 1208 1209 /* 1210 * Deliver T_UDERROR_IND when the application has asked for it. 1211 * The socket layer enables this automatically when connected. 1212 */ 1213 if (!connp->conn_dgram_errind) { 1214 freemsg(mp); 1215 return; 1216 } 1217 1218 sin = sin_null; 1219 sin.sin_family = AF_INET; 1220 sin.sin_addr.s_addr = ipha->ipha_dst; 1221 1222 if (IPCL_IS_NONSTR(connp)) { 1223 mutex_enter(&connp->conn_lock); 1224 if (icmp->icmp_state == TS_DATA_XFER) { 1225 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1226 mutex_exit(&connp->conn_lock); 1227 (*connp->conn_upcalls->su_set_error) 1228 (connp->conn_upper_handle, error); 1229 goto done; 1230 } 1231 } else { 1232 icmp->icmp_delayed_error = error; 1233 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1234 } 1235 mutex_exit(&connp->conn_lock); 1236 } else { 1237 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1238 error); 1239 if (mp1 != NULL) 1240 putnext(connp->conn_rq, mp1); 1241 } 1242 done: 1243 freemsg(mp); 1244 } 1245 1246 /* 1247 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1248 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 
1249 * Assumes that IP has pulled up all the extension headers as well as the 1250 * ICMPv6 header. 1251 */ 1252 static void 1253 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) 1254 { 1255 icmp6_t *icmp6; 1256 ip6_t *ip6h, *outer_ip6h; 1257 uint16_t iph_hdr_length; 1258 uint8_t *nexthdrp; 1259 sin6_t sin6; 1260 mblk_t *mp1; 1261 int error = 0; 1262 icmp_t *icmp = connp->conn_icmp; 1263 1264 outer_ip6h = (ip6_t *)mp->b_rptr; 1265 #ifdef DEBUG 1266 if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) 1267 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); 1268 else 1269 iph_hdr_length = IPV6_HDR_LEN; 1270 ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); 1271 #endif 1272 /* Skip past the outer IP and ICMP headers */ 1273 iph_hdr_length = ira->ira_ip_hdr_length; 1274 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 1275 1276 ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ 1277 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { 1278 freemsg(mp); 1279 return; 1280 } 1281 1282 switch (icmp6->icmp6_type) { 1283 case ICMP6_DST_UNREACH: 1284 switch (icmp6->icmp6_code) { 1285 case ICMP6_DST_UNREACH_NOPORT: 1286 error = ECONNREFUSED; 1287 break; 1288 case ICMP6_DST_UNREACH_ADMIN: 1289 case ICMP6_DST_UNREACH_NOROUTE: 1290 case ICMP6_DST_UNREACH_BEYONDSCOPE: 1291 case ICMP6_DST_UNREACH_ADDR: 1292 /* Transient errors */ 1293 break; 1294 default: 1295 break; 1296 } 1297 break; 1298 case ICMP6_PACKET_TOO_BIG: { 1299 struct T_unitdata_ind *tudi; 1300 struct T_opthdr *toh; 1301 size_t udi_size; 1302 mblk_t *newmp; 1303 t_scalar_t opt_length = sizeof (struct T_opthdr) + 1304 sizeof (struct ip6_mtuinfo); 1305 sin6_t *sin6; 1306 struct ip6_mtuinfo *mtuinfo; 1307 1308 /* 1309 * If the application has requested to receive path mtu 1310 * information, send up an empty message containing an 1311 * IPV6_PATHMTU ancillary data item. 
1312 */ 1313 if (!connp->conn_ipv6_recvpathmtu) 1314 break; 1315 1316 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + 1317 opt_length; 1318 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) { 1319 BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors); 1320 break; 1321 } 1322 1323 /* 1324 * newmp->b_cont is left to NULL on purpose. This is an 1325 * empty message containing only ancillary data. 1326 */ 1327 newmp->b_datap->db_type = M_PROTO; 1328 tudi = (struct T_unitdata_ind *)newmp->b_rptr; 1329 newmp->b_wptr = (uchar_t *)tudi + udi_size; 1330 tudi->PRIM_type = T_UNITDATA_IND; 1331 tudi->SRC_length = sizeof (sin6_t); 1332 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 1333 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t); 1334 tudi->OPT_length = opt_length; 1335 1336 sin6 = (sin6_t *)&tudi[1]; 1337 bzero(sin6, sizeof (sin6_t)); 1338 sin6->sin6_family = AF_INET6; 1339 sin6->sin6_addr = connp->conn_faddr_v6; 1340 1341 toh = (struct T_opthdr *)&sin6[1]; 1342 toh->level = IPPROTO_IPV6; 1343 toh->name = IPV6_PATHMTU; 1344 toh->len = opt_length; 1345 toh->status = 0; 1346 1347 mtuinfo = (struct ip6_mtuinfo *)&toh[1]; 1348 bzero(mtuinfo, sizeof (struct ip6_mtuinfo)); 1349 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 1350 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst; 1351 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu; 1352 /* 1353 * We've consumed everything we need from the original 1354 * message. Free it, then send our empty message. 
1355 */ 1356 freemsg(mp); 1357 icmp_ulp_recv(connp, newmp, msgdsize(newmp)); 1358 return; 1359 } 1360 case ICMP6_TIME_EXCEEDED: 1361 /* Transient errors */ 1362 break; 1363 case ICMP6_PARAM_PROB: 1364 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 1365 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 1366 (uchar_t *)ip6h + icmp6->icmp6_pptr == 1367 (uchar_t *)nexthdrp) { 1368 error = ECONNREFUSED; 1369 break; 1370 } 1371 break; 1372 } 1373 if (error == 0) { 1374 freemsg(mp); 1375 return; 1376 } 1377 1378 /* 1379 * Deliver T_UDERROR_IND when the application has asked for it. 1380 * The socket layer enables this automatically when connected. 1381 */ 1382 if (!connp->conn_dgram_errind) { 1383 freemsg(mp); 1384 return; 1385 } 1386 1387 sin6 = sin6_null; 1388 sin6.sin6_family = AF_INET6; 1389 sin6.sin6_addr = ip6h->ip6_dst; 1390 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 1391 if (IPCL_IS_NONSTR(connp)) { 1392 mutex_enter(&connp->conn_lock); 1393 if (icmp->icmp_state == TS_DATA_XFER) { 1394 if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, 1395 &connp->conn_faddr_v6)) { 1396 mutex_exit(&connp->conn_lock); 1397 (*connp->conn_upcalls->su_set_error) 1398 (connp->conn_upper_handle, error); 1399 goto done; 1400 } 1401 } else { 1402 icmp->icmp_delayed_error = error; 1403 *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; 1404 } 1405 mutex_exit(&connp->conn_lock); 1406 } else { 1407 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), 1408 NULL, 0, error); 1409 if (mp1 != NULL) 1410 putnext(connp->conn_rq, mp1); 1411 } 1412 done: 1413 freemsg(mp); 1414 } 1415 1416 /* 1417 * This routine responds to T_ADDR_REQ messages. It is called by icmp_wput. 1418 * The local address is filled in if endpoint is bound. The remote address 1419 * is filled in if remote address has been precified ("connected endpoint") 1420 * (The concept of connected CLTS sockets is alien to published TPI 1421 * but we support it anyway). 
1422 */ 1423 static void 1424 icmp_addr_req(queue_t *q, mblk_t *mp) 1425 { 1426 struct sockaddr *sa; 1427 mblk_t *ackmp; 1428 struct T_addr_ack *taa; 1429 icmp_t *icmp = Q_TO_ICMP(q); 1430 conn_t *connp = icmp->icmp_connp; 1431 uint_t addrlen; 1432 1433 /* Make it large enough for worst case */ 1434 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1435 2 * sizeof (sin6_t), 1); 1436 if (ackmp == NULL) { 1437 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1438 return; 1439 } 1440 taa = (struct T_addr_ack *)ackmp->b_rptr; 1441 1442 bzero(taa, sizeof (struct T_addr_ack)); 1443 ackmp->b_wptr = (uchar_t *)&taa[1]; 1444 1445 taa->PRIM_type = T_ADDR_ACK; 1446 ackmp->b_datap->db_type = M_PCPROTO; 1447 1448 if (connp->conn_family == AF_INET) 1449 addrlen = sizeof (sin_t); 1450 else 1451 addrlen = sizeof (sin6_t); 1452 1453 mutex_enter(&connp->conn_lock); 1454 /* 1455 * Note: Following code assumes 32 bit alignment of basic 1456 * data structures like sin_t and struct T_addr_ack. 1457 */ 1458 if (icmp->icmp_state != TS_UNBND) { 1459 /* 1460 * Fill in local address first 1461 */ 1462 taa->LOCADDR_offset = sizeof (*taa); 1463 taa->LOCADDR_length = addrlen; 1464 sa = (struct sockaddr *)&taa[1]; 1465 (void) conn_getsockname(connp, sa, &addrlen); 1466 ackmp->b_wptr += addrlen; 1467 } 1468 if (icmp->icmp_state == TS_DATA_XFER) { 1469 /* 1470 * connected, fill remote address too 1471 */ 1472 taa->REMADDR_length = addrlen; 1473 /* assumed 32-bit alignment */ 1474 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1475 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1476 (void) conn_getpeername(connp, sa, &addrlen); 1477 ackmp->b_wptr += addrlen; 1478 } 1479 mutex_exit(&connp->conn_lock); 1480 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1481 qreply(q, ackmp); 1482 } 1483 1484 static void 1485 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1486 { 1487 conn_t *connp = icmp->icmp_connp; 1488 1489 *tap = icmp_g_t_info_ack; 1490 1491 if (connp->conn_family 
== AF_INET6) 1492 tap->ADDR_size = sizeof (sin6_t); 1493 else 1494 tap->ADDR_size = sizeof (sin_t); 1495 tap->CURRENT_state = icmp->icmp_state; 1496 tap->OPT_size = icmp_max_optsize; 1497 } 1498 1499 static void 1500 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1501 t_uscalar_t cap_bits1) 1502 { 1503 tcap->CAP_bits1 = 0; 1504 1505 if (cap_bits1 & TC1_INFO) { 1506 icmp_copy_info(&tcap->INFO_ack, icmp); 1507 tcap->CAP_bits1 |= TC1_INFO; 1508 } 1509 } 1510 1511 /* 1512 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1513 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1514 * icmp_g_t_info_ack. The current state of the stream is copied from 1515 * icmp_state. 1516 */ 1517 static void 1518 icmp_capability_req(queue_t *q, mblk_t *mp) 1519 { 1520 icmp_t *icmp = Q_TO_ICMP(q); 1521 t_uscalar_t cap_bits1; 1522 struct T_capability_ack *tcap; 1523 1524 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1525 1526 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1527 mp->b_datap->db_type, T_CAPABILITY_ACK); 1528 if (!mp) 1529 return; 1530 1531 tcap = (struct T_capability_ack *)mp->b_rptr; 1532 1533 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1534 1535 qreply(q, mp); 1536 } 1537 1538 /* 1539 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1540 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1541 * The current state of the stream is copied from icmp_state. 1542 */ 1543 static void 1544 icmp_info_req(queue_t *q, mblk_t *mp) 1545 { 1546 icmp_t *icmp = Q_TO_ICMP(q); 1547 1548 /* Create a T_INFO_ACK message. 
*/ 1549 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1550 T_INFO_ACK); 1551 if (!mp) 1552 return; 1553 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1554 qreply(q, mp); 1555 } 1556 1557 static int 1558 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1559 int family) 1560 { 1561 conn_t *connp; 1562 dev_t conn_dev; 1563 int error; 1564 1565 /* If the stream is already open, return immediately. */ 1566 if (q->q_ptr != NULL) 1567 return (0); 1568 1569 if (sflag == MODOPEN) 1570 return (EINVAL); 1571 1572 /* 1573 * Since ICMP is not used so heavily, allocating from the small 1574 * arena should be sufficient. 1575 */ 1576 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1577 return (EBUSY); 1578 } 1579 1580 if (flag & SO_FALLBACK) { 1581 /* 1582 * Non streams socket needs a stream to fallback to 1583 */ 1584 RD(q)->q_ptr = (void *)conn_dev; 1585 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1586 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1587 qprocson(q); 1588 return (0); 1589 } 1590 1591 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1592 if (connp == NULL) { 1593 ASSERT(error != 0); 1594 inet_minor_free(ip_minor_arena_sa, connp->conn_dev); 1595 return (error); 1596 } 1597 1598 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1599 connp->conn_dev = conn_dev; 1600 connp->conn_minor_arena = ip_minor_arena_sa; 1601 1602 /* 1603 * Initialize the icmp_t structure for this stream. 1604 */ 1605 q->q_ptr = connp; 1606 WR(q)->q_ptr = connp; 1607 connp->conn_rq = q; 1608 connp->conn_wq = WR(q); 1609 1610 WR(q)->q_hiwat = connp->conn_sndbuf; 1611 WR(q)->q_lowat = connp->conn_sndlowat; 1612 1613 qprocson(q); 1614 1615 /* Set the Stream head write offset. 
*/ 1616 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1617 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1618 1619 mutex_enter(&connp->conn_lock); 1620 connp->conn_state_flags &= ~CONN_INCIPIENT; 1621 mutex_exit(&connp->conn_lock); 1622 1623 icmp_bind_proto(connp->conn_icmp); 1624 1625 return (0); 1626 } 1627 1628 /* For /dev/icmp aka AF_INET open */ 1629 static int 1630 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1631 { 1632 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1633 } 1634 1635 /* For /dev/icmp6 aka AF_INET6 open */ 1636 static int 1637 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1638 { 1639 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1640 } 1641 1642 /* 1643 * This is the open routine for icmp. It allocates a icmp_t structure for 1644 * the stream and, on the first open of the module, creates an ND table. 1645 */ 1646 static conn_t * 1647 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1648 { 1649 icmp_t *icmp; 1650 conn_t *connp; 1651 zoneid_t zoneid; 1652 netstack_t *ns; 1653 icmp_stack_t *is; 1654 int len; 1655 boolean_t isv6 = B_FALSE; 1656 1657 *err = secpolicy_net_icmpaccess(credp); 1658 if (*err != 0) 1659 return (NULL); 1660 1661 if (family == AF_INET6) 1662 isv6 = B_TRUE; 1663 1664 ns = netstack_find_by_cred(credp); 1665 ASSERT(ns != NULL); 1666 is = ns->netstack_icmp; 1667 ASSERT(is != NULL); 1668 1669 /* 1670 * For exclusive stacks we set the zoneid to zero 1671 * to make ICMP operate as if in the global zone. 1672 */ 1673 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1674 zoneid = GLOBAL_ZONEID; 1675 else 1676 zoneid = crgetzoneid(credp); 1677 1678 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1679 1680 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1681 icmp = connp->conn_icmp; 1682 1683 /* 1684 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1685 * done by netstack_find_by_cred() 1686 */ 1687 netstack_rele(ns); 1688 1689 /* 1690 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1691 * need to lock anything. 1692 */ 1693 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1694 ASSERT(connp->conn_icmp == icmp); 1695 ASSERT(icmp->icmp_connp == connp); 1696 1697 /* Set the initial state of the stream and the privilege status. */ 1698 icmp->icmp_state = TS_UNBND; 1699 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1700 if (isv6) { 1701 connp->conn_family = AF_INET6; 1702 connp->conn_ipversion = IPV6_VERSION; 1703 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1704 connp->conn_proto = IPPROTO_ICMPV6; 1705 /* May be changed by a SO_PROTOTYPE socket option. */ 1706 connp->conn_proto = IPPROTO_ICMPV6; 1707 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1708 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1709 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1710 len = sizeof (ip6_t); 1711 } else { 1712 connp->conn_family = AF_INET; 1713 connp->conn_ipversion = IPV4_VERSION; 1714 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1715 /* May be changed by a SO_PROTOTYPE socket option. */ 1716 connp->conn_proto = IPPROTO_ICMP; 1717 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1718 connp->conn_default_ttl = is->is_ipv4_ttl; 1719 len = sizeof (ipha_t); 1720 } 1721 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1722 1723 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1724 1725 /* 1726 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1727 * the checksum is provided in the pre-built packet. We clear 1728 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1729 * complete IP header and not to compute the transport checksum. 
1730 */ 1731 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1732 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1733 connp->conn_ixa->ixa_zoneid = zoneid; 1734 1735 connp->conn_zoneid = zoneid; 1736 1737 /* 1738 * If the caller has the process-wide flag set, then default to MAC 1739 * exempt mode. This allows read-down to unlabeled hosts. 1740 */ 1741 if (getpflags(NET_MAC_AWARE, credp) != 0) 1742 connp->conn_mac_mode = CONN_MAC_AWARE; 1743 1744 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1745 1746 icmp->icmp_is = is; 1747 1748 connp->conn_rcvbuf = is->is_recv_hiwat; 1749 connp->conn_sndbuf = is->is_xmit_hiwat; 1750 connp->conn_sndlowat = is->is_xmit_lowat; 1751 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1752 1753 connp->conn_wroff = len + is->is_wroff_extra; 1754 connp->conn_so_type = SOCK_RAW; 1755 1756 connp->conn_recv = icmp_input; 1757 connp->conn_recvicmp = icmp_icmp_input; 1758 crhold(credp); 1759 connp->conn_cred = credp; 1760 connp->conn_cpid = curproc->p_pid; 1761 connp->conn_open_time = ddi_get_lbolt64(); 1762 /* Cache things in ixa without an extra refhold */ 1763 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1764 connp->conn_ixa->ixa_cred = connp->conn_cred; 1765 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1766 if (is_system_labeled()) 1767 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1768 1769 connp->conn_flow_cntrld = B_FALSE; 1770 1771 if (is->is_pmtu_discovery) 1772 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1773 1774 return (connp); 1775 } 1776 1777 /* 1778 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1779 */ 1780 /* ARGSUSED */ 1781 static boolean_t 1782 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1783 { 1784 return (B_TRUE); 1785 } 1786 1787 /* 1788 * This routine gets default values of certain options whose default 1789 * values are maintained by protcol specific code 1790 */ 1791 int 1792 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1793 { 1794 icmp_t *icmp = Q_TO_ICMP(q); 1795 icmp_stack_t *is = icmp->icmp_is; 1796 int *i1 = (int *)ptr; 1797 1798 switch (level) { 1799 case IPPROTO_IP: 1800 switch (name) { 1801 case IP_MULTICAST_TTL: 1802 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1803 return (sizeof (uchar_t)); 1804 case IP_MULTICAST_LOOP: 1805 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1806 return (sizeof (uchar_t)); 1807 } 1808 break; 1809 case IPPROTO_IPV6: 1810 switch (name) { 1811 case IPV6_MULTICAST_HOPS: 1812 *i1 = IP_DEFAULT_MULTICAST_TTL; 1813 return (sizeof (int)); 1814 case IPV6_MULTICAST_LOOP: 1815 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1816 return (sizeof (int)); 1817 case IPV6_UNICAST_HOPS: 1818 *i1 = is->is_ipv6_hoplimit; 1819 return (sizeof (int)); 1820 } 1821 break; 1822 case IPPROTO_ICMPV6: 1823 switch (name) { 1824 case ICMP6_FILTER: 1825 /* Make it look like "pass all" */ 1826 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1827 return (sizeof (icmp6_filter_t)); 1828 } 1829 break; 1830 } 1831 return (-1); 1832 } 1833 1834 /* 1835 * This routine retrieves the current status of socket options. 1836 * It returns the size of the option retrieved, or -1. 
1837 */ 1838 int 1839 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 1840 { 1841 icmp_t *icmp = connp->conn_icmp; 1842 int *i1 = (int *)ptr; 1843 conn_opt_arg_t coas; 1844 int retval; 1845 1846 coas.coa_connp = connp; 1847 coas.coa_ixa = connp->conn_ixa; 1848 coas.coa_ipp = &connp->conn_xmit_ipp; 1849 coas.coa_ancillary = B_FALSE; 1850 coas.coa_changed = 0; 1851 1852 /* 1853 * We assume that the optcom framework has checked for the set 1854 * of levels and names that are supported, hence we don't worry 1855 * about rejecting based on that. 1856 * First check for ICMP specific handling, then pass to common routine. 1857 */ 1858 switch (level) { 1859 case IPPROTO_IP: 1860 /* 1861 * Only allow IPv4 option processing on IPv4 sockets. 1862 */ 1863 if (connp->conn_family != AF_INET) 1864 return (-1); 1865 1866 switch (name) { 1867 case IP_OPTIONS: 1868 case T_IP_OPTIONS: 1869 /* Options are passed up with each packet */ 1870 return (0); 1871 case IP_HDRINCL: 1872 mutex_enter(&connp->conn_lock); 1873 *i1 = (int)icmp->icmp_hdrincl; 1874 mutex_exit(&connp->conn_lock); 1875 return (sizeof (int)); 1876 } 1877 break; 1878 1879 case IPPROTO_IPV6: 1880 /* 1881 * Only allow IPv6 option processing on native IPv6 sockets. 1882 */ 1883 if (connp->conn_family != AF_INET6) 1884 return (-1); 1885 1886 switch (name) { 1887 case IPV6_CHECKSUM: 1888 /* 1889 * Return offset or -1 if no checksum offset. 1890 * Does not apply to IPPROTO_ICMPV6 1891 */ 1892 if (connp->conn_proto == IPPROTO_ICMPV6) 1893 return (-1); 1894 1895 mutex_enter(&connp->conn_lock); 1896 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) 1897 *i1 = connp->conn_ixa->ixa_raw_cksum_offset; 1898 else 1899 *i1 = -1; 1900 mutex_exit(&connp->conn_lock); 1901 return (sizeof (int)); 1902 } 1903 break; 1904 1905 case IPPROTO_ICMPV6: 1906 /* 1907 * Only allow IPv6 option processing on native IPv6 sockets. 
1908 */ 1909 if (connp->conn_family != AF_INET6) 1910 return (-1); 1911 1912 if (connp->conn_proto != IPPROTO_ICMPV6) 1913 return (-1); 1914 1915 switch (name) { 1916 case ICMP6_FILTER: 1917 mutex_enter(&connp->conn_lock); 1918 if (icmp->icmp_filter == NULL) { 1919 /* Make it look like "pass all" */ 1920 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1921 } else { 1922 (void) bcopy(icmp->icmp_filter, ptr, 1923 sizeof (icmp6_filter_t)); 1924 } 1925 mutex_exit(&connp->conn_lock); 1926 return (sizeof (icmp6_filter_t)); 1927 } 1928 } 1929 mutex_enter(&connp->conn_lock); 1930 retval = conn_opt_get(&coas, level, name, ptr); 1931 mutex_exit(&connp->conn_lock); 1932 return (retval); 1933 } 1934 1935 /* 1936 * This routine retrieves the current status of socket options. 1937 * It returns the size of the option retrieved, or -1. 1938 */ 1939 int 1940 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 1941 { 1942 conn_t *connp = Q_TO_CONN(q); 1943 int err; 1944 1945 err = icmp_opt_get(connp, level, name, ptr); 1946 return (err); 1947 } 1948 1949 /* 1950 * This routine sets socket options. 1951 */ 1952 int 1953 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, 1954 uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) 1955 { 1956 conn_t *connp = coa->coa_connp; 1957 ip_xmit_attr_t *ixa = coa->coa_ixa; 1958 icmp_t *icmp = connp->conn_icmp; 1959 icmp_stack_t *is = icmp->icmp_is; 1960 int *i1 = (int *)invalp; 1961 boolean_t onoff = (*i1 == 0) ? 0 : 1; 1962 int error; 1963 1964 ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); 1965 1966 /* 1967 * For fixed length options, no sanity check 1968 * of passed in length is done. It is assumed *_optcom_req() 1969 * routines do the right thing. 
1970 */ 1971 1972 switch (level) { 1973 case SOL_SOCKET: 1974 switch (name) { 1975 case SO_PROTOTYPE: 1976 if ((*i1 & 0xFF) != IPPROTO_ICMP && 1977 (*i1 & 0xFF) != IPPROTO_ICMPV6 && 1978 secpolicy_net_rawaccess(cr) != 0) { 1979 return (EACCES); 1980 } 1981 if (checkonly) 1982 break; 1983 1984 mutex_enter(&connp->conn_lock); 1985 connp->conn_proto = *i1 & 0xFF; 1986 ixa->ixa_protocol = connp->conn_proto; 1987 if ((connp->conn_proto == IPPROTO_RAW || 1988 connp->conn_proto == IPPROTO_IGMP) && 1989 connp->conn_family == AF_INET) { 1990 icmp->icmp_hdrincl = 1; 1991 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 1992 } else if (connp->conn_proto == IPPROTO_UDP || 1993 connp->conn_proto == IPPROTO_TCP || 1994 connp->conn_proto == IPPROTO_SCTP) { 1995 /* Used by test applications like psh */ 1996 icmp->icmp_hdrincl = 0; 1997 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 1998 } else { 1999 icmp->icmp_hdrincl = 0; 2000 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2001 } 2002 2003 if (connp->conn_family == AF_INET6 && 2004 connp->conn_proto == IPPROTO_ICMPV6) { 2005 /* Set offset for icmp6_cksum */ 2006 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2007 ixa->ixa_raw_cksum_offset = 2; 2008 } 2009 if (icmp->icmp_filter != NULL && 2010 connp->conn_proto != IPPROTO_ICMPV6) { 2011 kmem_free(icmp->icmp_filter, 2012 sizeof (icmp6_filter_t)); 2013 icmp->icmp_filter = NULL; 2014 } 2015 mutex_exit(&connp->conn_lock); 2016 2017 coa->coa_changed |= COA_HEADER_CHANGED; 2018 /* 2019 * For SCTP, we don't use icmp_bind_proto() for 2020 * raw socket binding. 2021 */ 2022 if (connp->conn_proto == IPPROTO_SCTP) 2023 return (0); 2024 2025 coa->coa_changed |= COA_ICMP_BIND_NEEDED; 2026 return (0); 2027 2028 case SO_SNDBUF: 2029 if (*i1 > is->is_max_buf) { 2030 return (ENOBUFS); 2031 } 2032 break; 2033 case SO_RCVBUF: 2034 if (*i1 > is->is_max_buf) { 2035 return (ENOBUFS); 2036 } 2037 break; 2038 } 2039 break; 2040 2041 case IPPROTO_IP: 2042 /* 2043 * Only allow IPv4 option processing on IPv4 sockets. 
2044 */ 2045 if (connp->conn_family != AF_INET) 2046 return (EINVAL); 2047 2048 switch (name) { 2049 case IP_HDRINCL: 2050 if (!checkonly) { 2051 mutex_enter(&connp->conn_lock); 2052 icmp->icmp_hdrincl = onoff; 2053 if (onoff) 2054 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2055 else 2056 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2057 mutex_exit(&connp->conn_lock); 2058 } 2059 break; 2060 } 2061 break; 2062 2063 case IPPROTO_IPV6: 2064 if (connp->conn_family != AF_INET6) 2065 return (EINVAL); 2066 2067 switch (name) { 2068 case IPV6_CHECKSUM: 2069 /* 2070 * Integer offset into the user data of where the 2071 * checksum is located. 2072 * Offset of -1 disables option. 2073 * Does not apply to IPPROTO_ICMPV6. 2074 */ 2075 if (connp->conn_proto == IPPROTO_ICMPV6 || 2076 coa->coa_ancillary) { 2077 return (EINVAL); 2078 } 2079 if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { 2080 /* Negative or not 16 bit aligned offset */ 2081 return (EINVAL); 2082 } 2083 if (checkonly) 2084 break; 2085 2086 mutex_enter(&connp->conn_lock); 2087 if (*i1 == -1) { 2088 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2089 ixa->ixa_raw_cksum_offset = 0; 2090 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2091 } else { 2092 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; 2093 ixa->ixa_raw_cksum_offset = *i1; 2094 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2095 } 2096 mutex_exit(&connp->conn_lock); 2097 break; 2098 } 2099 break; 2100 2101 case IPPROTO_ICMPV6: 2102 /* 2103 * Only allow IPv6 option processing on IPv6 sockets. 
2104 */ 2105 if (connp->conn_family != AF_INET6) 2106 return (EINVAL); 2107 if (connp->conn_proto != IPPROTO_ICMPV6) 2108 return (EINVAL); 2109 2110 switch (name) { 2111 case ICMP6_FILTER: 2112 if (checkonly) 2113 break; 2114 2115 if ((inlen != 0) && 2116 (inlen != sizeof (icmp6_filter_t))) 2117 return (EINVAL); 2118 2119 mutex_enter(&connp->conn_lock); 2120 if (inlen == 0) { 2121 if (icmp->icmp_filter != NULL) { 2122 kmem_free(icmp->icmp_filter, 2123 sizeof (icmp6_filter_t)); 2124 icmp->icmp_filter = NULL; 2125 } 2126 } else { 2127 if (icmp->icmp_filter == NULL) { 2128 icmp->icmp_filter = kmem_alloc( 2129 sizeof (icmp6_filter_t), 2130 KM_NOSLEEP); 2131 if (icmp->icmp_filter == NULL) { 2132 mutex_exit(&connp->conn_lock); 2133 return (ENOBUFS); 2134 } 2135 } 2136 (void) bcopy(invalp, icmp->icmp_filter, inlen); 2137 } 2138 mutex_exit(&connp->conn_lock); 2139 break; 2140 } 2141 break; 2142 } 2143 error = conn_opt_set(coa, level, name, inlen, invalp, 2144 checkonly, cr); 2145 return (error); 2146 } 2147 2148 /* 2149 * This routine sets socket options. 2150 */ 2151 int 2152 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 2153 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2154 void *thisdg_attrs, cred_t *cr) 2155 { 2156 icmp_t *icmp = connp->conn_icmp; 2157 int err; 2158 conn_opt_arg_t coas, *coa; 2159 boolean_t checkonly; 2160 icmp_stack_t *is = icmp->icmp_is; 2161 2162 switch (optset_context) { 2163 case SETFN_OPTCOM_CHECKONLY: 2164 checkonly = B_TRUE; 2165 /* 2166 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 2167 * inlen != 0 implies value supplied and 2168 * we have to "pretend" to set it. 2169 * inlen == 0 implies that there is no 2170 * value part in T_CHECK request and just validation 2171 * done elsewhere should be enough, we just return here. 
2172 */ 2173 if (inlen == 0) { 2174 *outlenp = 0; 2175 return (0); 2176 } 2177 break; 2178 case SETFN_OPTCOM_NEGOTIATE: 2179 checkonly = B_FALSE; 2180 break; 2181 case SETFN_UD_NEGOTIATE: 2182 case SETFN_CONN_NEGOTIATE: 2183 checkonly = B_FALSE; 2184 /* 2185 * Negotiating local and "association-related" options 2186 * through T_UNITDATA_REQ. 2187 * 2188 * Following routine can filter out ones we do not 2189 * want to be "set" this way. 2190 */ 2191 if (!icmp_opt_allow_udr_set(level, name)) { 2192 *outlenp = 0; 2193 return (EINVAL); 2194 } 2195 break; 2196 default: 2197 /* 2198 * We should never get here 2199 */ 2200 *outlenp = 0; 2201 return (EINVAL); 2202 } 2203 2204 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 2205 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 2206 2207 if (thisdg_attrs != NULL) { 2208 /* Options from T_UNITDATA_REQ */ 2209 coa = (conn_opt_arg_t *)thisdg_attrs; 2210 ASSERT(coa->coa_connp == connp); 2211 ASSERT(coa->coa_ixa != NULL); 2212 ASSERT(coa->coa_ipp != NULL); 2213 ASSERT(coa->coa_ancillary); 2214 } else { 2215 coa = &coas; 2216 coas.coa_connp = connp; 2217 /* Get a reference on conn_ixa to prevent concurrent mods */ 2218 coas.coa_ixa = conn_get_ixa(connp, B_TRUE); 2219 if (coas.coa_ixa == NULL) { 2220 *outlenp = 0; 2221 return (ENOMEM); 2222 } 2223 coas.coa_ipp = &connp->conn_xmit_ipp; 2224 coas.coa_ancillary = B_FALSE; 2225 coas.coa_changed = 0; 2226 } 2227 2228 err = icmp_do_opt_set(coa, level, name, inlen, invalp, 2229 cr, checkonly); 2230 if (err != 0) { 2231 errout: 2232 if (!coa->coa_ancillary) 2233 ixa_refrele(coa->coa_ixa); 2234 *outlenp = 0; 2235 return (err); 2236 } 2237 2238 /* 2239 * Common case of OK return with outval same as inval. 
2240 */ 2241 if (invalp != outvalp) { 2242 /* don't trust bcopy for identical src/dst */ 2243 (void) bcopy(invalp, outvalp, inlen); 2244 } 2245 *outlenp = inlen; 2246 2247 /* 2248 * If this was not ancillary data, then we rebuild the headers, 2249 * update the IRE/NCE, and IPsec as needed. 2250 * Since the label depends on the destination we go through 2251 * ip_set_destination first. 2252 */ 2253 if (coa->coa_ancillary) { 2254 return (0); 2255 } 2256 2257 if (coa->coa_changed & COA_ROUTE_CHANGED) { 2258 in6_addr_t saddr, faddr, nexthop; 2259 in_port_t fport; 2260 2261 /* 2262 * We clear lastdst to make sure we pick up the change 2263 * next time sending. 2264 * If we are connected we re-cache the information. 2265 * We ignore errors to preserve BSD behavior. 2266 * Note that we don't redo IPsec policy lookup here 2267 * since the final destination (or source) didn't change. 2268 */ 2269 mutex_enter(&connp->conn_lock); 2270 connp->conn_v6lastdst = ipv6_all_zeros; 2271 2272 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, 2273 &connp->conn_faddr_v6, &nexthop); 2274 saddr = connp->conn_saddr_v6; 2275 faddr = connp->conn_faddr_v6; 2276 fport = connp->conn_fport; 2277 mutex_exit(&connp->conn_lock); 2278 2279 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && 2280 !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { 2281 (void) ip_attr_connect(connp, coa->coa_ixa, 2282 &saddr, &faddr, &nexthop, fport, NULL, NULL, 2283 IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 2284 } 2285 } 2286 2287 ixa_refrele(coa->coa_ixa); 2288 2289 if (coa->coa_changed & COA_HEADER_CHANGED) { 2290 /* 2291 * Rebuild the header template if we are connected. 2292 * Otherwise clear conn_v6lastdst so we rebuild the header 2293 * in the data path. 
2294 */ 2295 mutex_enter(&connp->conn_lock); 2296 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 2297 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 2298 err = icmp_build_hdr_template(connp, 2299 &connp->conn_saddr_v6, &connp->conn_faddr_v6, 2300 connp->conn_flowinfo); 2301 if (err != 0) { 2302 mutex_exit(&connp->conn_lock); 2303 return (err); 2304 } 2305 } else { 2306 connp->conn_v6lastdst = ipv6_all_zeros; 2307 } 2308 mutex_exit(&connp->conn_lock); 2309 } 2310 if (coa->coa_changed & COA_RCVBUF_CHANGED) { 2311 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 2312 connp->conn_rcvbuf); 2313 } 2314 if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 2315 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 2316 } 2317 if (coa->coa_changed & COA_WROFF_CHANGED) { 2318 /* Increase wroff if needed */ 2319 uint_t wroff; 2320 2321 mutex_enter(&connp->conn_lock); 2322 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; 2323 if (wroff > connp->conn_wroff) { 2324 connp->conn_wroff = wroff; 2325 mutex_exit(&connp->conn_lock); 2326 (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); 2327 } else { 2328 mutex_exit(&connp->conn_lock); 2329 } 2330 } 2331 if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { 2332 icmp_bind_proto(icmp); 2333 } 2334 return (err); 2335 } 2336 2337 /* This routine sets socket options. */ 2338 int 2339 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 2340 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2341 void *thisdg_attrs, cred_t *cr) 2342 { 2343 conn_t *connp = Q_TO_CONN(q); 2344 int error; 2345 2346 error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, 2347 outlenp, outvalp, thisdg_attrs, cr); 2348 return (error); 2349 } 2350 2351 /* 2352 * Setup IP headers. 2353 * 2354 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, 2355 * but icmp_output_hdrincl restores ipha_protocol once we return. 
2356 */ 2357 mblk_t * 2358 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2359 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2360 mblk_t *data_mp, int *errorp) 2361 { 2362 mblk_t *mp; 2363 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2364 uint_t data_len; 2365 uint32_t cksum; 2366 2367 data_len = msgdsize(data_mp); 2368 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2369 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2370 if (mp == NULL) { 2371 ASSERT(*errorp != 0); 2372 return (NULL); 2373 } 2374 2375 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2376 2377 /* 2378 * If there was a routing option/header then conn_prepend_hdr 2379 * has massaged it and placed the pseudo-header checksum difference 2380 * in the cksum argument. 2381 * 2382 * Prepare for ICMPv6 checksum done in IP. 2383 * 2384 * We make it easy for IP to include our pseudo header 2385 * by putting our length (and any routing header adjustment) 2386 * in the ICMPv6 checksum field. 2387 * The IP source, destination, and length have already been set by 2388 * conn_prepend_hdr. 
2389 */ 2390 cksum += data_len; 2391 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2392 ASSERT(cksum < 0x10000); 2393 2394 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2395 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2396 2397 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2398 } else { 2399 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2400 uint_t cksum_offset = 0; 2401 2402 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2403 2404 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2405 if (connp->conn_proto == IPPROTO_ICMPV6) { 2406 cksum_offset = ixa->ixa_ip_hdr_length + 2407 offsetof(icmp6_t, icmp6_cksum); 2408 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2409 cksum_offset = ixa->ixa_ip_hdr_length + 2410 ixa->ixa_raw_cksum_offset; 2411 } 2412 } 2413 if (cksum_offset != 0) { 2414 uint16_t *ptr; 2415 2416 /* Make sure the checksum fits in the first mblk */ 2417 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2418 mblk_t *mp1; 2419 2420 mp1 = msgpullup(mp, 2421 cksum_offset + sizeof (short)); 2422 freemsg(mp); 2423 if (mp1 == NULL) { 2424 *errorp = ENOMEM; 2425 return (NULL); 2426 } 2427 mp = mp1; 2428 ip6h = (ip6_t *)mp->b_rptr; 2429 } 2430 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2431 *ptr = htons(cksum); 2432 } 2433 } 2434 2435 /* Note that we don't try to update wroff due to ancillary data */ 2436 return (mp); 2437 } 2438 2439 static int 2440 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2441 const in6_addr_t *v6dst, uint32_t flowinfo) 2442 { 2443 int error; 2444 2445 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2446 /* 2447 * We clear lastdst to make sure we don't use the lastdst path 2448 * next time sending since we might not have set v6dst yet. 2449 */ 2450 connp->conn_v6lastdst = ipv6_all_zeros; 2451 2452 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2453 if (error != 0) 2454 return (error); 2455 2456 /* 2457 * Any routing header/option has been massaged. The checksum difference 2458 * is stored in conn_sum. 
2459 */ 2460 return (0); 2461 } 2462 2463 static mblk_t * 2464 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2465 { 2466 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2467 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2468 /* 2469 * fallback has started but messages have not been moved yet 2470 */ 2471 if (icmp->icmp_fallback_queue_head == NULL) { 2472 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2473 icmp->icmp_fallback_queue_head = mp; 2474 icmp->icmp_fallback_queue_tail = mp; 2475 } else { 2476 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2477 icmp->icmp_fallback_queue_tail->b_next = mp; 2478 icmp->icmp_fallback_queue_tail = mp; 2479 } 2480 return (NULL); 2481 } else { 2482 /* 2483 * Fallback completed, let the caller putnext() the mblk. 2484 */ 2485 return (mp); 2486 } 2487 } 2488 2489 /* 2490 * Deliver data to ULP. In case we have a socket, and it's falling back to 2491 * TPI, then we'll queue the mp for later processing. 2492 */ 2493 static void 2494 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2495 { 2496 if (IPCL_IS_NONSTR(connp)) { 2497 icmp_t *icmp = connp->conn_icmp; 2498 int error; 2499 2500 ASSERT(len == msgdsize(mp)); 2501 if ((*connp->conn_upcalls->su_recv) 2502 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2503 mutex_enter(&icmp->icmp_recv_lock); 2504 if (error == ENOSPC) { 2505 /* 2506 * let's confirm while holding the lock 2507 */ 2508 if ((*connp->conn_upcalls->su_recv) 2509 (connp->conn_upper_handle, NULL, 0, 0, 2510 &error, NULL) < 0) { 2511 ASSERT(error == ENOSPC); 2512 if (error == ENOSPC) { 2513 connp->conn_flow_cntrld = 2514 B_TRUE; 2515 } 2516 } 2517 mutex_exit(&icmp->icmp_recv_lock); 2518 } else { 2519 ASSERT(error == EOPNOTSUPP); 2520 mp = icmp_queue_fallback(icmp, mp); 2521 mutex_exit(&icmp->icmp_recv_lock); 2522 if (mp != NULL) 2523 putnext(connp->conn_rq, mp); 2524 } 2525 } 2526 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2527 } else { 2528 putnext(connp->conn_rq, mp); 2529 } 2530 } 2531 2532 /* 2533 * This is 
the inbound data path. 2534 * IP has already pulled up the IP headers and verified alignment 2535 * etc. 2536 */ 2537 /* ARGSUSED2 */ 2538 static void 2539 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2540 { 2541 conn_t *connp = (conn_t *)arg1; 2542 struct T_unitdata_ind *tudi; 2543 uchar_t *rptr; /* Pointer to IP header */ 2544 int ip_hdr_length; 2545 int udi_size; /* Size of T_unitdata_ind */ 2546 int pkt_len; 2547 icmp_t *icmp; 2548 ip_pkt_t ipps; 2549 ip6_t *ip6h; 2550 mblk_t *mp1; 2551 crb_t recv_ancillary; 2552 icmp_stack_t *is; 2553 sin_t *sin; 2554 sin6_t *sin6; 2555 ipha_t *ipha; 2556 2557 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2558 2559 icmp = connp->conn_icmp; 2560 is = icmp->icmp_is; 2561 rptr = mp->b_rptr; 2562 2563 ASSERT(DB_TYPE(mp) == M_DATA); 2564 ASSERT(OK_32PTR(rptr)); 2565 ASSERT(ira->ira_pktlen == msgdsize(mp)); 2566 pkt_len = ira->ira_pktlen; 2567 2568 /* 2569 * Get a snapshot of these and allow other threads to change 2570 * them after that. We need the same recv_ancillary when determining 2571 * the size as when adding the ancillary data items. 2572 */ 2573 mutex_enter(&connp->conn_lock); 2574 recv_ancillary = connp->conn_recv_ancillary; 2575 mutex_exit(&connp->conn_lock); 2576 2577 ip_hdr_length = ira->ira_ip_hdr_length; 2578 ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ 2579 2580 /* Initialize regardless of IP version */ 2581 ipps.ipp_fields = 0; 2582 2583 if (ira->ira_flags & IRAF_IS_IPV4) { 2584 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); 2585 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 2586 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); 2587 2588 ipha = (ipha_t *)mp->b_rptr; 2589 if (recv_ancillary.crb_all != 0) 2590 (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); 2591 2592 /* 2593 * BSD for some reason adjusts ipha_length to exclude the 2594 * IP header length. We do the same. 
2595 */ 2596 if (is->is_bsd_compat) { 2597 ushort_t len; 2598 2599 len = ntohs(ipha->ipha_length); 2600 if (mp->b_datap->db_ref > 1) { 2601 /* 2602 * Allocate a new IP header so that we can 2603 * modify ipha_length. 2604 */ 2605 mblk_t *mp1; 2606 2607 mp1 = allocb(ip_hdr_length, BPRI_MED); 2608 if (mp1 == NULL) { 2609 freemsg(mp); 2610 BUMP_MIB(&is->is_rawip_mib, 2611 rawipInErrors); 2612 return; 2613 } 2614 bcopy(rptr, mp1->b_rptr, ip_hdr_length); 2615 mp->b_rptr = rptr + ip_hdr_length; 2616 rptr = mp1->b_rptr; 2617 ipha = (ipha_t *)rptr; 2618 mp1->b_cont = mp; 2619 mp1->b_wptr = rptr + ip_hdr_length; 2620 mp = mp1; 2621 } 2622 len -= ip_hdr_length; 2623 ipha->ipha_length = htons(len); 2624 } 2625 2626 /* 2627 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 2628 * sockets. This is ensured by icmp_bind and the IP fanout code. 2629 */ 2630 ASSERT(connp->conn_family == AF_INET); 2631 2632 /* 2633 * This is the inbound data path. Packets are passed upstream 2634 * as T_UNITDATA_IND messages with full IPv4 headers still 2635 * attached. 2636 */ 2637 2638 /* 2639 * Normally only send up the source address. 2640 * If any ancillary data items are wanted we add those. 2641 */ 2642 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 2643 if (recv_ancillary.crb_all != 0) { 2644 udi_size += conn_recvancillary_size(connp, 2645 recv_ancillary, ira, mp, &ipps); 2646 } 2647 2648 /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ 2649 mp1 = allocb(udi_size, BPRI_MED); 2650 if (mp1 == NULL) { 2651 freemsg(mp); 2652 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2653 return; 2654 } 2655 mp1->b_cont = mp; 2656 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2657 mp1->b_datap->db_type = M_PROTO; 2658 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2659 tudi->PRIM_type = T_UNITDATA_IND; 2660 tudi->SRC_length = sizeof (sin_t); 2661 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2662 sin = (sin_t *)&tudi[1]; 2663 *sin = sin_null; 2664 sin->sin_family = AF_INET; 2665 sin->sin_addr.s_addr = ipha->ipha_src; 2666 *(uint32_t *)&sin->sin_zero[0] = 0; 2667 *(uint32_t *)&sin->sin_zero[4] = 0; 2668 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + 2669 sizeof (sin_t); 2670 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 2671 tudi->OPT_length = udi_size; 2672 2673 /* 2674 * Add options if IP_RECVIF etc is set 2675 */ 2676 if (udi_size != 0) { 2677 conn_recvancillary_add(connp, recv_ancillary, ira, 2678 &ipps, (uchar_t *)&sin[1], udi_size); 2679 } 2680 goto deliver; 2681 } 2682 2683 ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); 2684 /* 2685 * IPv6 packets can only be received by applications 2686 * that are prepared to receive IPv6 addresses. 2687 * The IP fanout must ensure this. 2688 */ 2689 ASSERT(connp->conn_family == AF_INET6); 2690 2691 /* 2692 * Handle IPv6 packets. We don't pass up the IP headers with the 2693 * payload for IPv6. 2694 */ 2695 2696 ip6h = (ip6_t *)rptr; 2697 if (recv_ancillary.crb_all != 0) { 2698 /* 2699 * Call on ip_find_hdr_v6 which gets individual lenghts of 2700 * extension headers (and pointers to them). 2701 */ 2702 uint8_t nexthdr; 2703 2704 /* We don't care about the length or nextheader. */ 2705 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); 2706 2707 /* 2708 * We do not pass up hop-by-hop options or any other 2709 * extension header as part of the packet. Applications 2710 * that want to see them have to specify IPV6_RECV* socket 2711 * options. 
And conn_recvancillary_size/add explicitly 2712 * drops the TX option from IPV6_HOPOPTS as it does for UDP. 2713 * 2714 * If we had multilevel ICMP sockets, then we'd want to 2715 * modify conn_recvancillary_size/add to 2716 * allow the user to see the label. 2717 */ 2718 } 2719 2720 /* 2721 * Check a filter for ICMPv6 types if needed. 2722 * Verify raw checksums if needed. 2723 */ 2724 mutex_enter(&connp->conn_lock); 2725 if (icmp->icmp_filter != NULL) { 2726 int type; 2727 2728 /* Assumes that IP has done the pullupmsg */ 2729 type = mp->b_rptr[ip_hdr_length]; 2730 2731 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); 2732 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { 2733 mutex_exit(&connp->conn_lock); 2734 freemsg(mp); 2735 return; 2736 } 2737 } 2738 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2739 /* Checksum */ 2740 uint16_t *up; 2741 uint32_t sum; 2742 int remlen; 2743 2744 up = (uint16_t *)&ip6h->ip6_src; 2745 2746 remlen = msgdsize(mp) - ip_hdr_length; 2747 sum = htons(connp->conn_proto + remlen) 2748 + up[0] + up[1] + up[2] + up[3] 2749 + up[4] + up[5] + up[6] + up[7] 2750 + up[8] + up[9] + up[10] + up[11] 2751 + up[12] + up[13] + up[14] + up[15]; 2752 sum = (sum & 0xffff) + (sum >> 16); 2753 sum = IP_CSUM(mp, ip_hdr_length, sum); 2754 if (sum != 0) { 2755 /* IPv6 RAW checksum failed */ 2756 ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); 2757 mutex_exit(&connp->conn_lock); 2758 freemsg(mp); 2759 BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); 2760 return; 2761 } 2762 } 2763 mutex_exit(&connp->conn_lock); 2764 2765 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2766 2767 if (recv_ancillary.crb_all != 0) { 2768 udi_size += conn_recvancillary_size(connp, 2769 recv_ancillary, ira, mp, &ipps); 2770 } 2771 2772 mp1 = allocb(udi_size, BPRI_MED); 2773 if (mp1 == NULL) { 2774 freemsg(mp); 2775 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2776 return; 2777 } 2778 mp1->b_cont = mp; 2779 mp1->b_datap->db_type = M_PROTO; 2780 
tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2781 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2782 tudi->PRIM_type = T_UNITDATA_IND; 2783 tudi->SRC_length = sizeof (sin6_t); 2784 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2785 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2786 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); 2787 tudi->OPT_length = udi_size; 2788 sin6 = (sin6_t *)&tudi[1]; 2789 *sin6 = sin6_null; 2790 sin6->sin6_port = 0; 2791 sin6->sin6_family = AF_INET6; 2792 2793 sin6->sin6_addr = ip6h->ip6_src; 2794 /* No sin6_flowinfo per API */ 2795 sin6->sin6_flowinfo = 0; 2796 /* For link-scope pass up scope id */ 2797 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) 2798 sin6->sin6_scope_id = ira->ira_ruifindex; 2799 else 2800 sin6->sin6_scope_id = 0; 2801 sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 2802 IPCL_ZONEID(connp), is->is_netstack); 2803 2804 if (udi_size != 0) { 2805 conn_recvancillary_add(connp, recv_ancillary, ira, 2806 &ipps, (uchar_t *)&sin6[1], udi_size); 2807 } 2808 2809 /* Skip all the IPv6 headers per API */ 2810 mp->b_rptr += ip_hdr_length; 2811 pkt_len -= ip_hdr_length; 2812 2813 deliver: 2814 BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); 2815 icmp_ulp_recv(connp, mp1, pkt_len); 2816 } 2817 2818 /* 2819 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report 2820 * information that can be changing beneath us. 
2821 */ 2822 mblk_t * 2823 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2824 { 2825 mblk_t *mpdata; 2826 struct opthdr *optp; 2827 conn_t *connp = Q_TO_CONN(q); 2828 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2829 mblk_t *mp2ctl; 2830 2831 /* 2832 * make a copy of the original message 2833 */ 2834 mp2ctl = copymsg(mpctl); 2835 2836 if (mpctl == NULL || 2837 (mpdata = mpctl->b_cont) == NULL) { 2838 freemsg(mpctl); 2839 freemsg(mp2ctl); 2840 return (0); 2841 } 2842 2843 /* fixed length structure for IPv4 and IPv6 counters */ 2844 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2845 optp->level = EXPER_RAWIP; 2846 optp->name = 0; 2847 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2848 sizeof (is->is_rawip_mib)); 2849 optp->len = msgdsize(mpdata); 2850 qreply(q, mpctl); 2851 2852 return (mp2ctl); 2853 } 2854 2855 /* 2856 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2857 * TODO: If this ever actually tries to set anything, it needs to be 2858 * to do the appropriate locking. 2859 */ 2860 /* ARGSUSED */ 2861 int 2862 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2863 uchar_t *ptr, int len) 2864 { 2865 switch (level) { 2866 case EXPER_RAWIP: 2867 return (0); 2868 default: 2869 return (1); 2870 } 2871 } 2872 2873 /* 2874 * This routine creates a T_UDERROR_IND message and passes it upstream. 2875 * The address and options are copied from the T_UNITDATA_REQ message 2876 * passed in mp. This message is freed. 
2877 */ 2878 static void 2879 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2880 { 2881 struct T_unitdata_req *tudr; 2882 mblk_t *mp1; 2883 uchar_t *destaddr; 2884 t_scalar_t destlen; 2885 uchar_t *optaddr; 2886 t_scalar_t optlen; 2887 2888 if ((mp->b_wptr < mp->b_rptr) || 2889 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2890 goto done; 2891 } 2892 tudr = (struct T_unitdata_req *)mp->b_rptr; 2893 destaddr = mp->b_rptr + tudr->DEST_offset; 2894 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2895 destaddr + tudr->DEST_length < mp->b_rptr || 2896 destaddr + tudr->DEST_length > mp->b_wptr) { 2897 goto done; 2898 } 2899 optaddr = mp->b_rptr + tudr->OPT_offset; 2900 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2901 optaddr + tudr->OPT_length < mp->b_rptr || 2902 optaddr + tudr->OPT_length > mp->b_wptr) { 2903 goto done; 2904 } 2905 destlen = tudr->DEST_length; 2906 optlen = tudr->OPT_length; 2907 2908 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2909 (char *)optaddr, optlen, err); 2910 if (mp1 != NULL) 2911 qreply(q, mp1); 2912 2913 done: 2914 freemsg(mp); 2915 } 2916 2917 static int 2918 rawip_do_unbind(conn_t *connp) 2919 { 2920 icmp_t *icmp = connp->conn_icmp; 2921 2922 mutex_enter(&connp->conn_lock); 2923 /* If a bind has not been done, we can't unbind. 
*/ 2924 if (icmp->icmp_state == TS_UNBND) { 2925 mutex_exit(&connp->conn_lock); 2926 return (-TOUTSTATE); 2927 } 2928 connp->conn_saddr_v6 = ipv6_all_zeros; 2929 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2930 connp->conn_laddr_v6 = ipv6_all_zeros; 2931 connp->conn_mcbc_bind = B_FALSE; 2932 connp->conn_lport = 0; 2933 connp->conn_fport = 0; 2934 /* In case we were also connected */ 2935 connp->conn_faddr_v6 = ipv6_all_zeros; 2936 connp->conn_v6lastdst = ipv6_all_zeros; 2937 2938 icmp->icmp_state = TS_UNBND; 2939 2940 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2941 &connp->conn_faddr_v6, connp->conn_flowinfo); 2942 mutex_exit(&connp->conn_lock); 2943 2944 ip_unbind(connp); 2945 return (0); 2946 } 2947 2948 /* 2949 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2950 * After some error checking, the message is passed downstream to ip. 2951 */ 2952 static void 2953 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2954 { 2955 conn_t *connp = Q_TO_CONN(q); 2956 int error; 2957 2958 ASSERT(mp->b_cont == NULL); 2959 error = rawip_do_unbind(connp); 2960 if (error) { 2961 if (error < 0) { 2962 icmp_err_ack(q, mp, -error, 0); 2963 } else { 2964 icmp_err_ack(q, mp, 0, error); 2965 } 2966 return; 2967 } 2968 2969 /* 2970 * Convert mp into a T_OK_ACK 2971 */ 2972 2973 mp = mi_tpi_ok_ack_alloc(mp); 2974 2975 /* 2976 * should not happen in practice... T_OK_ACK is smaller than the 2977 * original message. 2978 */ 2979 ASSERT(mp != NULL); 2980 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 2981 qreply(q, mp); 2982 } 2983 2984 /* 2985 * Process IPv4 packets that already include an IP header. 2986 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 2987 * IPPROTO_IGMP). 2988 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 2989 * 2990 * The packet is assumed to have a base (20 byte) IP header followed 2991 * by the upper-layer protocol. 
We include any IP_OPTIONS including a 2992 * CIPSO label but otherwise preserve the base IP header. 2993 */ 2994 static int 2995 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 2996 { 2997 icmp_t *icmp = connp->conn_icmp; 2998 icmp_stack_t *is = icmp->icmp_is; 2999 ipha_t iphas; 3000 ipha_t *ipha; 3001 int ip_hdr_length; 3002 int tp_hdr_len; 3003 ip_xmit_attr_t *ixa; 3004 ip_pkt_t *ipp; 3005 in6_addr_t v6src; 3006 in6_addr_t v6dst; 3007 in6_addr_t v6nexthop; 3008 int error; 3009 boolean_t do_ipsec; 3010 3011 /* 3012 * We need an exclusive copy of conn_ixa since the included IP 3013 * header could have any destination. 3014 * That copy has no pointers hence we 3015 * need to set them up once we've parsed the ancillary data. 3016 */ 3017 ixa = conn_get_ixa_exclusive(connp); 3018 if (ixa == NULL) { 3019 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3020 freemsg(mp); 3021 return (ENOMEM); 3022 } 3023 ASSERT(cr != NULL); 3024 /* 3025 * Caller has a reference on cr; from db_credp or because we 3026 * are running in process context. 
3027 */ 3028 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3029 ixa->ixa_cred = cr; 3030 ixa->ixa_cpid = pid; 3031 if (is_system_labeled()) { 3032 /* We need to restart with a label based on the cred */ 3033 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3034 } 3035 3036 /* In case previous destination was multicast or multirt */ 3037 ip_attr_newdst(ixa); 3038 3039 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3040 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3041 if (ipp == NULL) { 3042 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3043 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3044 ixa->ixa_cpid = connp->conn_cpid; 3045 ixa_refrele(ixa); 3046 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3047 freemsg(mp); 3048 return (ENOMEM); 3049 } 3050 mutex_enter(&connp->conn_lock); 3051 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3052 mutex_exit(&connp->conn_lock); 3053 if (error != 0) { 3054 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3055 freemsg(mp); 3056 goto done; 3057 } 3058 3059 /* Sanity check length of packet */ 3060 ipha = (ipha_t *)mp->b_rptr; 3061 3062 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3063 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3064 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3065 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3066 freemsg(mp); 3067 goto done; 3068 } 3069 ipha = (ipha_t *)mp->b_rptr; 3070 } 3071 ipha->ipha_version_and_hdr_length = 3072 (IP_VERSION<<4) | (ip_hdr_length>>2); 3073 3074 /* 3075 * We set IXAF_DONTFRAG if the application set DF which makes 3076 * IP not fragment. 
3077 */ 3078 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3079 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3080 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3081 else 3082 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3083 3084 /* Even for multicast and broadcast we honor the apps ttl */ 3085 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3086 3087 /* 3088 * No source verification for non-local addresses 3089 */ 3090 if (ipha->ipha_src != INADDR_ANY && 3091 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3092 is->is_netstack->netstack_ip, B_FALSE) 3093 != IPVL_UNICAST_UP) { 3094 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3095 } 3096 3097 if (ipha->ipha_dst == INADDR_ANY) 3098 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3099 3100 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3101 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3102 3103 /* Defer IPsec if it might need to look at ICMP type/code */ 3104 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3105 ixa->ixa_flags |= IXAF_IS_IPV4; 3106 3107 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3108 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3109 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3110 (do_ipsec ? IPDF_IPSEC : 0)); 3111 switch (error) { 3112 case 0: 3113 break; 3114 case EADDRNOTAVAIL: 3115 /* 3116 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3117 * Don't have the application see that errno 3118 */ 3119 error = ENETUNREACH; 3120 goto failed; 3121 case ENETDOWN: 3122 /* 3123 * Have !ipif_addr_ready address; drop packet silently 3124 * until we can get applications to not send until we 3125 * are ready. 3126 */ 3127 error = 0; 3128 goto failed; 3129 case EHOSTUNREACH: 3130 case ENETUNREACH: 3131 if (ixa->ixa_ire != NULL) { 3132 /* 3133 * Let conn_ip_output/ire_send_noroute return 3134 * the error and send any local ICMP error. 
3135 */ 3136 error = 0; 3137 break; 3138 } 3139 /* FALLTHRU */ 3140 default: 3141 failed: 3142 freemsg(mp); 3143 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3144 goto done; 3145 } 3146 if (ipha->ipha_src == INADDR_ANY) 3147 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3148 3149 /* 3150 * We might be going to a different destination than last time, 3151 * thus check that TX allows the communication and compute any 3152 * needed label. 3153 * 3154 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3155 * don't have to worry about concurrent threads. 3156 */ 3157 if (is_system_labeled()) { 3158 /* 3159 * Check whether Trusted Solaris policy allows communication 3160 * with this host, and pretend that the destination is 3161 * unreachable if not. 3162 * Compute any needed label and place it in ipp_label_v4/v6. 3163 * 3164 * Later conn_build_hdr_template/conn_prepend_hdr takes 3165 * ipp_label_v4/v6 to form the packet. 3166 * 3167 * Tsol note: We have ipp structure local to this thread so 3168 * no locking is needed. 3169 */ 3170 error = conn_update_label(connp, ixa, &v6dst, ipp); 3171 if (error != 0) { 3172 freemsg(mp); 3173 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3174 goto done; 3175 } 3176 } 3177 3178 /* 3179 * Save away a copy of the IPv4 header the application passed down 3180 * and then prepend an IPv4 header complete with any IP options 3181 * including label. 3182 * We need a struct copy since icmp_prepend_hdr will reuse the available 3183 * space in the mblk. 
3184 */ 3185 iphas = *ipha; 3186 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3187 3188 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3189 if (mp == NULL) { 3190 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3191 ASSERT(error != 0); 3192 goto done; 3193 } 3194 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3195 error = EMSGSIZE; 3196 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3197 freemsg(mp); 3198 goto done; 3199 } 3200 /* Restore key parts of the header that the application passed down */ 3201 ipha = (ipha_t *)mp->b_rptr; 3202 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3203 ipha->ipha_ident = iphas.ipha_ident; 3204 ipha->ipha_fragment_offset_and_flags = 3205 iphas.ipha_fragment_offset_and_flags; 3206 ipha->ipha_ttl = iphas.ipha_ttl; 3207 ipha->ipha_protocol = iphas.ipha_protocol; 3208 ipha->ipha_src = iphas.ipha_src; 3209 ipha->ipha_dst = iphas.ipha_dst; 3210 3211 ixa->ixa_protocol = ipha->ipha_protocol; 3212 3213 /* 3214 * Make sure that the IP header plus any transport header that is 3215 * checksumed by ip_output is in the first mblk. (ip_output assumes 3216 * that at least the checksum field is in the first mblk.) 
3217 */ 3218 switch (ipha->ipha_protocol) { 3219 case IPPROTO_UDP: 3220 tp_hdr_len = 8; 3221 break; 3222 case IPPROTO_TCP: 3223 tp_hdr_len = 20; 3224 break; 3225 default: 3226 tp_hdr_len = 0; 3227 break; 3228 } 3229 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3230 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3231 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3232 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3233 if (mp->b_cont == NULL) 3234 error = EINVAL; 3235 else 3236 error = ENOMEM; 3237 freemsg(mp); 3238 goto done; 3239 } 3240 } 3241 3242 if (!do_ipsec) { 3243 /* Policy might differ for different ICMP type/code */ 3244 if (ixa->ixa_ipsec_policy != NULL) { 3245 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3246 ixa->ixa_ipsec_policy = NULL; 3247 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3248 } 3249 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3250 if (mp == NULL) { 3251 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3252 error = EHOSTUNREACH; /* IPsec policy failure */ 3253 goto done; 3254 } 3255 } 3256 3257 /* We're done. Pass the packet to ip. */ 3258 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3259 3260 error = conn_ip_output(mp, ixa); 3261 /* No rawipOutErrors if an error since IP increases its error counter */ 3262 switch (error) { 3263 case 0: 3264 break; 3265 case EWOULDBLOCK: 3266 (void) ixa_check_drain_insert(connp, ixa); 3267 error = 0; 3268 break; 3269 case EADDRNOTAVAIL: 3270 /* 3271 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3272 * Don't have the application see that errno 3273 */ 3274 error = ENETUNREACH; 3275 break; 3276 } 3277 done: 3278 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3279 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3280 ixa->ixa_cpid = connp->conn_cpid; 3281 ixa_refrele(ixa); 3282 ip_pkt_free(ipp); 3283 kmem_free(ipp, sizeof (*ipp)); 3284 return (error); 3285 } 3286 3287 static mblk_t * 3288 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3289 { 3290 ipha_t *ipha = NULL; 3291 ip6_t *ip6h = NULL; 3292 3293 if (ixa->ixa_flags & IXAF_IS_IPV4) 3294 ipha = (ipha_t *)mp->b_rptr; 3295 else 3296 ip6h = (ip6_t *)mp->b_rptr; 3297 3298 if (ixa->ixa_ipsec_policy != NULL) { 3299 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3300 ixa->ixa_ipsec_policy = NULL; 3301 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3302 } 3303 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3304 } 3305 3306 /* 3307 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3308 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3309 * the TPI options, otherwise we take them from msg_control. 3310 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3311 * Always consumes mp; never consumes tudr_mp. 3312 */ 3313 static int 3314 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3315 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3316 { 3317 icmp_t *icmp = connp->conn_icmp; 3318 icmp_stack_t *is = icmp->icmp_is; 3319 int error; 3320 ip_xmit_attr_t *ixa; 3321 ip_pkt_t *ipp; 3322 in6_addr_t v6src; 3323 in6_addr_t v6dst; 3324 in6_addr_t v6nexthop; 3325 in_port_t dstport; 3326 uint32_t flowinfo; 3327 uint_t srcid; 3328 int is_absreq_failure = 0; 3329 conn_opt_arg_t coas, *coa; 3330 3331 ASSERT(tudr_mp != NULL || msg != NULL); 3332 3333 /* 3334 * Get ixa before checking state to handle a disconnect race. 
3335 * 3336 * We need an exclusive copy of conn_ixa since the ancillary data 3337 * options might modify it. That copy has no pointers hence we 3338 * need to set them up once we've parsed the ancillary data. 3339 */ 3340 ixa = conn_get_ixa_exclusive(connp); 3341 if (ixa == NULL) { 3342 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3343 freemsg(mp); 3344 return (ENOMEM); 3345 } 3346 ASSERT(cr != NULL); 3347 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3348 ixa->ixa_cred = cr; 3349 ixa->ixa_cpid = pid; 3350 if (is_system_labeled()) { 3351 /* We need to restart with a label based on the cred */ 3352 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3353 } 3354 3355 /* In case previous destination was multicast or multirt */ 3356 ip_attr_newdst(ixa); 3357 3358 /* Get a copy of conn_xmit_ipp since the options might change it */ 3359 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3360 if (ipp == NULL) { 3361 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3362 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3363 ixa->ixa_cpid = connp->conn_cpid; 3364 ixa_refrele(ixa); 3365 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3366 freemsg(mp); 3367 return (ENOMEM); 3368 } 3369 mutex_enter(&connp->conn_lock); 3370 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3371 mutex_exit(&connp->conn_lock); 3372 if (error != 0) { 3373 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3374 freemsg(mp); 3375 goto done; 3376 } 3377 3378 /* 3379 * Parse the options and update ixa and ipp as a result. 
3380 */ 3381 3382 coa = &coas; 3383 coa->coa_connp = connp; 3384 coa->coa_ixa = ixa; 3385 coa->coa_ipp = ipp; 3386 coa->coa_ancillary = B_TRUE; 3387 coa->coa_changed = 0; 3388 3389 if (msg != NULL) { 3390 error = process_auxiliary_options(connp, msg->msg_control, 3391 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3392 } else { 3393 struct T_unitdata_req *tudr; 3394 3395 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3396 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3397 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3398 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3399 coa, &is_absreq_failure); 3400 } 3401 if (error != 0) { 3402 /* 3403 * Note: No special action needed in this 3404 * module for "is_absreq_failure" 3405 */ 3406 freemsg(mp); 3407 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3408 goto done; 3409 } 3410 ASSERT(is_absreq_failure == 0); 3411 3412 mutex_enter(&connp->conn_lock); 3413 /* 3414 * If laddr is unspecified then we look at sin6_src_id. 3415 * We will give precedence to a source address set with IPV6_PKTINFO 3416 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3417 * want ip_attr_connect to select a source (since it can fail) when 3418 * IPV6_PKTINFO is specified. 3419 * If this doesn't result in a source address then we get a source 3420 * from ip_attr_connect() below. 
3421 */ 3422 v6src = connp->conn_saddr_v6; 3423 if (sin != NULL) { 3424 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3425 dstport = sin->sin_port; 3426 flowinfo = 0; 3427 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3428 ixa->ixa_flags |= IXAF_IS_IPV4; 3429 } else if (sin6 != NULL) { 3430 v6dst = sin6->sin6_addr; 3431 dstport = sin6->sin6_port; 3432 flowinfo = sin6->sin6_flowinfo; 3433 srcid = sin6->__sin6_src_id; 3434 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3435 ixa->ixa_scopeid = sin6->sin6_scope_id; 3436 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3437 } else { 3438 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3439 } 3440 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3441 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3442 connp->conn_netstack); 3443 } 3444 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 3445 ixa->ixa_flags |= IXAF_IS_IPV4; 3446 else 3447 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3448 } else { 3449 /* Connected case */ 3450 v6dst = connp->conn_faddr_v6; 3451 flowinfo = connp->conn_flowinfo; 3452 } 3453 mutex_exit(&connp->conn_lock); 3454 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 3455 if (ipp->ipp_fields & IPPF_ADDR) { 3456 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3457 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3458 v6src = ipp->ipp_addr; 3459 } else { 3460 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3461 v6src = ipp->ipp_addr; 3462 } 3463 } 3464 /* 3465 * Allow source not assigned to the system 3466 * only if it is not a local addresses 3467 */ 3468 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3469 ip_laddr_t laddr_type; 3470 3471 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3472 ipaddr_t v4src; 3473 3474 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3475 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3476 is->is_netstack->netstack_ip, B_FALSE); 3477 } else { 3478 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3479 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3480 } 3481 if (laddr_type != IPVL_UNICAST_UP) 3482 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3483 } 3484 3485 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3486 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3487 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3488 3489 switch (error) { 3490 case 0: 3491 break; 3492 case EADDRNOTAVAIL: 3493 /* 3494 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3495 * Don't have the application see that errno 3496 */ 3497 error = ENETUNREACH; 3498 goto failed; 3499 case ENETDOWN: 3500 /* 3501 * Have !ipif_addr_ready address; drop packet silently 3502 * until we can get applications to not send until we 3503 * are ready. 3504 */ 3505 error = 0; 3506 goto failed; 3507 case EHOSTUNREACH: 3508 case ENETUNREACH: 3509 if (ixa->ixa_ire != NULL) { 3510 /* 3511 * Let conn_ip_output/ire_send_noroute return 3512 * the error and send any local ICMP error. 
3513 */ 3514 error = 0; 3515 break; 3516 } 3517 /* FALLTHRU */ 3518 default: 3519 failed: 3520 freemsg(mp); 3521 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3522 goto done; 3523 } 3524 3525 /* 3526 * We might be going to a different destination than last time, 3527 * thus check that TX allows the communication and compute any 3528 * needed label. 3529 * 3530 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3531 * don't have to worry about concurrent threads. 3532 */ 3533 if (is_system_labeled()) { 3534 /* 3535 * Check whether Trusted Solaris policy allows communication 3536 * with this host, and pretend that the destination is 3537 * unreachable if not. 3538 * Compute any needed label and place it in ipp_label_v4/v6. 3539 * 3540 * Later conn_build_hdr_template/conn_prepend_hdr takes 3541 * ipp_label_v4/v6 to form the packet. 3542 * 3543 * Tsol note: We have ipp structure local to this thread so 3544 * no locking is needed. 3545 */ 3546 error = conn_update_label(connp, ixa, &v6dst, ipp); 3547 if (error != 0) { 3548 freemsg(mp); 3549 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3550 goto done; 3551 } 3552 } 3553 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3554 &error); 3555 if (mp == NULL) { 3556 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3557 ASSERT(error != 0); 3558 goto done; 3559 } 3560 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3561 error = EMSGSIZE; 3562 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3563 freemsg(mp); 3564 goto done; 3565 } 3566 3567 /* Policy might differ for different ICMP type/code */ 3568 mp = icmp_output_attach_policy(mp, connp, ixa); 3569 if (mp == NULL) { 3570 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3571 error = EHOSTUNREACH; /* IPsec policy failure */ 3572 goto done; 3573 } 3574 3575 /* We're done. Pass the packet to ip. 
*/ 3576 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3577 3578 error = conn_ip_output(mp, ixa); 3579 if (!connp->conn_unspec_src) 3580 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3581 /* No rawipOutErrors if an error since IP increases its error counter */ 3582 switch (error) { 3583 case 0: 3584 break; 3585 case EWOULDBLOCK: 3586 (void) ixa_check_drain_insert(connp, ixa); 3587 error = 0; 3588 break; 3589 case EADDRNOTAVAIL: 3590 /* 3591 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3592 * Don't have the application see that errno 3593 */ 3594 error = ENETUNREACH; 3595 /* FALLTHRU */ 3596 default: 3597 mutex_enter(&connp->conn_lock); 3598 /* 3599 * Clear the source and v6lastdst so we call ip_attr_connect 3600 * for the next packet and try to pick a better source. 3601 */ 3602 if (connp->conn_mcbc_bind) 3603 connp->conn_saddr_v6 = ipv6_all_zeros; 3604 else 3605 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3606 connp->conn_v6lastdst = ipv6_all_zeros; 3607 mutex_exit(&connp->conn_lock); 3608 break; 3609 } 3610 done: 3611 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3612 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3613 ixa->ixa_cpid = connp->conn_cpid; 3614 ixa_refrele(ixa); 3615 ip_pkt_free(ipp); 3616 kmem_free(ipp, sizeof (*ipp)); 3617 return (error); 3618 } 3619 3620 /* 3621 * Handle sending an M_DATA for a connected socket. 3622 * Handles both IPv4 and IPv6. 3623 */ 3624 int 3625 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3626 { 3627 icmp_t *icmp = connp->conn_icmp; 3628 icmp_stack_t *is = icmp->icmp_is; 3629 int error; 3630 ip_xmit_attr_t *ixa; 3631 boolean_t do_ipsec; 3632 3633 /* 3634 * If no other thread is using conn_ixa this just gets a reference to 3635 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 
3636 */ 3637 ixa = conn_get_ixa(connp, B_FALSE); 3638 if (ixa == NULL) { 3639 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3640 freemsg(mp); 3641 return (ENOMEM); 3642 } 3643 3644 ASSERT(cr != NULL); 3645 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3646 ixa->ixa_cred = cr; 3647 ixa->ixa_cpid = pid; 3648 3649 /* Defer IPsec if it might need to look at ICMP type/code */ 3650 switch (ixa->ixa_protocol) { 3651 case IPPROTO_ICMP: 3652 case IPPROTO_ICMPV6: 3653 do_ipsec = B_FALSE; 3654 break; 3655 default: 3656 do_ipsec = B_TRUE; 3657 } 3658 3659 mutex_enter(&connp->conn_lock); 3660 mp = icmp_prepend_header_template(connp, ixa, mp, 3661 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3662 3663 if (mp == NULL) { 3664 ASSERT(error != 0); 3665 mutex_exit(&connp->conn_lock); 3666 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3667 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3668 ixa->ixa_cpid = connp->conn_cpid; 3669 ixa_refrele(ixa); 3670 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3671 freemsg(mp); 3672 return (error); 3673 } 3674 3675 if (!do_ipsec) { 3676 /* Policy might differ for different ICMP type/code */ 3677 mp = icmp_output_attach_policy(mp, connp, ixa); 3678 if (mp == NULL) { 3679 mutex_exit(&connp->conn_lock); 3680 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3681 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3682 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3683 ixa->ixa_cpid = connp->conn_cpid; 3684 ixa_refrele(ixa); 3685 return (EHOSTUNREACH); /* IPsec policy failure */ 3686 } 3687 } 3688 3689 /* 3690 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3691 * safe copy, then we need to fill in any pointers in it. 
3692 */ 3693 if (ixa->ixa_ire == NULL) { 3694 in6_addr_t faddr, saddr; 3695 in6_addr_t nexthop; 3696 in_port_t fport; 3697 3698 saddr = connp->conn_saddr_v6; 3699 faddr = connp->conn_faddr_v6; 3700 fport = connp->conn_fport; 3701 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3702 mutex_exit(&connp->conn_lock); 3703 3704 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3705 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3706 (do_ipsec ? IPDF_IPSEC : 0)); 3707 switch (error) { 3708 case 0: 3709 break; 3710 case EADDRNOTAVAIL: 3711 /* 3712 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3713 * Don't have the application see that errno 3714 */ 3715 error = ENETUNREACH; 3716 goto failed; 3717 case ENETDOWN: 3718 /* 3719 * Have !ipif_addr_ready address; drop packet silently 3720 * until we can get applications to not send until we 3721 * are ready. 3722 */ 3723 error = 0; 3724 goto failed; 3725 case EHOSTUNREACH: 3726 case ENETUNREACH: 3727 if (ixa->ixa_ire != NULL) { 3728 /* 3729 * Let conn_ip_output/ire_send_noroute return 3730 * the error and send any local ICMP error. 3731 */ 3732 error = 0; 3733 break; 3734 } 3735 /* FALLTHRU */ 3736 default: 3737 failed: 3738 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3739 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3740 ixa->ixa_cpid = connp->conn_cpid; 3741 ixa_refrele(ixa); 3742 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3743 freemsg(mp); 3744 return (error); 3745 } 3746 } else { 3747 /* Done with conn_t */ 3748 mutex_exit(&connp->conn_lock); 3749 } 3750 3751 /* We're done. Pass the packet to ip. 
*/ 3752 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3753 3754 error = conn_ip_output(mp, ixa); 3755 /* No rawipOutErrors if an error since IP increases its error counter */ 3756 switch (error) { 3757 case 0: 3758 break; 3759 case EWOULDBLOCK: 3760 (void) ixa_check_drain_insert(connp, ixa); 3761 error = 0; 3762 break; 3763 case EADDRNOTAVAIL: 3764 /* 3765 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3766 * Don't have the application see that errno 3767 */ 3768 error = ENETUNREACH; 3769 break; 3770 } 3771 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3772 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3773 ixa->ixa_cpid = connp->conn_cpid; 3774 ixa_refrele(ixa); 3775 return (error); 3776 } 3777 3778 /* 3779 * Handle sending an M_DATA to the last destination. 3780 * Handles both IPv4 and IPv6. 3781 * 3782 * NOTE: The caller must hold conn_lock and we drop it here. 3783 */ 3784 int 3785 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3786 ip_xmit_attr_t *ixa) 3787 { 3788 icmp_t *icmp = connp->conn_icmp; 3789 icmp_stack_t *is = icmp->icmp_is; 3790 int error; 3791 boolean_t do_ipsec; 3792 3793 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3794 ASSERT(ixa != NULL); 3795 3796 ASSERT(cr != NULL); 3797 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3798 ixa->ixa_cred = cr; 3799 ixa->ixa_cpid = pid; 3800 3801 /* Defer IPsec if it might need to look at ICMP type/code */ 3802 switch (ixa->ixa_protocol) { 3803 case IPPROTO_ICMP: 3804 case IPPROTO_ICMPV6: 3805 do_ipsec = B_FALSE; 3806 break; 3807 default: 3808 do_ipsec = B_TRUE; 3809 } 3810 3811 3812 mp = icmp_prepend_header_template(connp, ixa, mp, 3813 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3814 3815 if (mp == NULL) { 3816 ASSERT(error != 0); 3817 mutex_exit(&connp->conn_lock); 3818 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3819 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3820 ixa->ixa_cpid = connp->conn_cpid; 3821 ixa_refrele(ixa); 3822 BUMP_MIB(&is->is_rawip_mib, 
rawipOutErrors); 3823 freemsg(mp); 3824 return (error); 3825 } 3826 3827 if (!do_ipsec) { 3828 /* Policy might differ for different ICMP type/code */ 3829 mp = icmp_output_attach_policy(mp, connp, ixa); 3830 if (mp == NULL) { 3831 mutex_exit(&connp->conn_lock); 3832 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3833 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3834 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3835 ixa->ixa_cpid = connp->conn_cpid; 3836 ixa_refrele(ixa); 3837 return (EHOSTUNREACH); /* IPsec policy failure */ 3838 } 3839 } 3840 3841 /* 3842 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3843 * safe copy, then we need to fill in any pointers in it. 3844 */ 3845 if (ixa->ixa_ire == NULL) { 3846 in6_addr_t lastdst, lastsrc; 3847 in6_addr_t nexthop; 3848 in_port_t lastport; 3849 3850 lastsrc = connp->conn_v6lastsrc; 3851 lastdst = connp->conn_v6lastdst; 3852 lastport = connp->conn_lastdstport; 3853 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3854 mutex_exit(&connp->conn_lock); 3855 3856 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3857 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3858 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3859 switch (error) { 3860 case 0: 3861 break; 3862 case EADDRNOTAVAIL: 3863 /* 3864 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3865 * Don't have the application see that errno 3866 */ 3867 error = ENETUNREACH; 3868 goto failed; 3869 case ENETDOWN: 3870 /* 3871 * Have !ipif_addr_ready address; drop packet silently 3872 * until we can get applications to not send until we 3873 * are ready. 3874 */ 3875 error = 0; 3876 goto failed; 3877 case EHOSTUNREACH: 3878 case ENETUNREACH: 3879 if (ixa->ixa_ire != NULL) { 3880 /* 3881 * Let conn_ip_output/ire_send_noroute return 3882 * the error and send any local ICMP error. 
3883 */ 3884 error = 0; 3885 break; 3886 } 3887 /* FALLTHRU */ 3888 default: 3889 failed: 3890 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3891 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3892 ixa->ixa_cpid = connp->conn_cpid; 3893 ixa_refrele(ixa); 3894 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3895 freemsg(mp); 3896 return (error); 3897 } 3898 } else { 3899 /* Done with conn_t */ 3900 mutex_exit(&connp->conn_lock); 3901 } 3902 3903 /* We're done. Pass the packet to ip. */ 3904 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3905 error = conn_ip_output(mp, ixa); 3906 /* No rawipOutErrors if an error since IP increases its error counter */ 3907 switch (error) { 3908 case 0: 3909 break; 3910 case EWOULDBLOCK: 3911 (void) ixa_check_drain_insert(connp, ixa); 3912 error = 0; 3913 break; 3914 case EADDRNOTAVAIL: 3915 /* 3916 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3917 * Don't have the application see that errno 3918 */ 3919 error = ENETUNREACH; 3920 /* FALLTHRU */ 3921 default: 3922 mutex_enter(&connp->conn_lock); 3923 /* 3924 * Clear the source and v6lastdst so we call ip_attr_connect 3925 * for the next packet and try to pick a better source. 3926 */ 3927 if (connp->conn_mcbc_bind) 3928 connp->conn_saddr_v6 = ipv6_all_zeros; 3929 else 3930 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3931 connp->conn_v6lastdst = ipv6_all_zeros; 3932 mutex_exit(&connp->conn_lock); 3933 break; 3934 } 3935 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3936 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3937 ixa->ixa_cpid = connp->conn_cpid; 3938 ixa_refrele(ixa); 3939 return (error); 3940 } 3941 3942 3943 /* 3944 * Prepend the header template and then fill in the source and 3945 * flowinfo. The caller needs to handle the destination address since 3946 * it's setting is different if rthdr or source route. 3947 * 3948 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3949 * When it returns NULL it sets errorp. 
3950 */ 3951 static mblk_t * 3952 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, 3953 const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) 3954 { 3955 icmp_t *icmp = connp->conn_icmp; 3956 icmp_stack_t *is = icmp->icmp_is; 3957 uint_t pktlen; 3958 uint_t copylen; 3959 uint8_t *iph; 3960 uint_t ip_hdr_length; 3961 uint32_t cksum; 3962 ip_pkt_t *ipp; 3963 3964 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3965 3966 /* 3967 * Copy the header template. 3968 */ 3969 copylen = connp->conn_ht_iphc_len; 3970 pktlen = copylen + msgdsize(mp); 3971 if (pktlen > IP_MAXPACKET) { 3972 freemsg(mp); 3973 *errorp = EMSGSIZE; 3974 return (NULL); 3975 } 3976 ixa->ixa_pktlen = pktlen; 3977 3978 /* check/fix buffer config, setup pointers into it */ 3979 iph = mp->b_rptr - copylen; 3980 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { 3981 mblk_t *mp1; 3982 3983 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); 3984 if (mp1 == NULL) { 3985 freemsg(mp); 3986 *errorp = ENOMEM; 3987 return (NULL); 3988 } 3989 mp1->b_wptr = DB_LIM(mp1); 3990 mp1->b_cont = mp; 3991 mp = mp1; 3992 iph = (mp->b_wptr - copylen); 3993 } 3994 mp->b_rptr = iph; 3995 bcopy(connp->conn_ht_iphc, iph, copylen); 3996 ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); 3997 3998 ixa->ixa_ip_hdr_length = ip_hdr_length; 3999 4000 /* 4001 * Prepare for ICMPv6 checksum done in IP. 4002 * 4003 * icmp_build_hdr_template has already massaged any routing header 4004 * and placed the result in conn_sum. 4005 * 4006 * We make it easy for IP to include our pseudo header 4007 * by putting our length (and any routing header adjustment) 4008 * in the ICMPv6 checksum field. 
4009 */ 4010 cksum = pktlen - ip_hdr_length; 4011 4012 cksum += connp->conn_sum; 4013 cksum = (cksum >> 16) + (cksum & 0xFFFF); 4014 ASSERT(cksum < 0x10000); 4015 4016 ipp = &connp->conn_xmit_ipp; 4017 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4018 ipha_t *ipha = (ipha_t *)iph; 4019 4020 ipha->ipha_length = htons((uint16_t)pktlen); 4021 4022 /* if IP_PKTINFO specified an addres it wins over bind() */ 4023 if ((ipp->ipp_fields & IPPF_ADDR) && 4024 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4025 ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); 4026 ipha->ipha_src = ipp->ipp_addr_v4; 4027 } else { 4028 IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); 4029 } 4030 } else { 4031 ip6_t *ip6h = (ip6_t *)iph; 4032 uint_t cksum_offset = 0; 4033 4034 ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); 4035 4036 /* if IP_PKTINFO specified an addres it wins over bind() */ 4037 if ((ipp->ipp_fields & IPPF_ADDR) && 4038 !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4039 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); 4040 ip6h->ip6_src = ipp->ipp_addr; 4041 } else { 4042 ip6h->ip6_src = *v6src; 4043 } 4044 ip6h->ip6_vcf = 4045 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 4046 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 4047 if (ipp->ipp_fields & IPPF_TCLASS) { 4048 /* Overrides the class part of flowinfo */ 4049 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, 4050 ipp->ipp_tclass); 4051 } 4052 4053 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 4054 if (connp->conn_proto == IPPROTO_ICMPV6) { 4055 cksum_offset = ixa->ixa_ip_hdr_length + 4056 offsetof(icmp6_t, icmp6_cksum); 4057 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 4058 cksum_offset = ixa->ixa_ip_hdr_length + 4059 ixa->ixa_raw_cksum_offset; 4060 } 4061 } 4062 if (cksum_offset != 0) { 4063 uint16_t *ptr; 4064 4065 /* Make sure the checksum fits in the first mblk */ 4066 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 4067 mblk_t *mp1; 4068 4069 mp1 = msgpullup(mp, 4070 cksum_offset + sizeof (short)); 4071 freemsg(mp); 4072 if (mp1 
== NULL) { 4073 *errorp = ENOMEM; 4074 return (NULL); 4075 } 4076 mp = mp1; 4077 iph = mp->b_rptr; 4078 ip6h = (ip6_t *)iph; 4079 } 4080 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 4081 *ptr = htons(cksum); 4082 } 4083 } 4084 4085 return (mp); 4086 } 4087 4088 /* 4089 * This routine handles all messages passed downstream. It either 4090 * consumes the message or passes it downstream; it never queues a 4091 * a message. 4092 */ 4093 void 4094 icmp_wput(queue_t *q, mblk_t *mp) 4095 { 4096 sin6_t *sin6; 4097 sin_t *sin = NULL; 4098 uint_t srcid; 4099 conn_t *connp = Q_TO_CONN(q); 4100 icmp_t *icmp = connp->conn_icmp; 4101 int error = 0; 4102 struct sockaddr *addr = NULL; 4103 socklen_t addrlen; 4104 icmp_stack_t *is = icmp->icmp_is; 4105 struct T_unitdata_req *tudr; 4106 mblk_t *data_mp; 4107 cred_t *cr; 4108 pid_t pid; 4109 4110 /* 4111 * We directly handle several cases here: T_UNITDATA_REQ message 4112 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected 4113 * socket. 4114 */ 4115 switch (DB_TYPE(mp)) { 4116 case M_DATA: 4117 /* sockfs never sends down M_DATA */ 4118 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4119 freemsg(mp); 4120 return; 4121 4122 case M_PROTO: 4123 case M_PCPROTO: 4124 tudr = (struct T_unitdata_req *)mp->b_rptr; 4125 if (MBLKL(mp) < sizeof (*tudr) || 4126 ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { 4127 icmp_wput_other(q, mp); 4128 return; 4129 } 4130 break; 4131 4132 default: 4133 icmp_wput_other(q, mp); 4134 return; 4135 } 4136 4137 /* Handle valid T_UNITDATA_REQ here */ 4138 data_mp = mp->b_cont; 4139 if (data_mp == NULL) { 4140 error = EPROTO; 4141 goto ud_error2; 4142 } 4143 mp->b_cont = NULL; 4144 4145 if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { 4146 error = EADDRNOTAVAIL; 4147 goto ud_error2; 4148 } 4149 4150 /* 4151 * All Solaris components should pass a db_credp 4152 * for this message, hence we ASSERT. 
4153 * On production kernels we return an error to be robust against 4154 * random streams modules sitting on top of us. 4155 */ 4156 cr = msg_getcred(mp, &pid); 4157 ASSERT(cr != NULL); 4158 if (cr == NULL) { 4159 error = EINVAL; 4160 goto ud_error2; 4161 } 4162 4163 /* 4164 * If a port has not been bound to the stream, fail. 4165 * This is not a problem when sockfs is directly 4166 * above us, because it will ensure that the socket 4167 * is first bound before allowing data to be sent. 4168 */ 4169 if (icmp->icmp_state == TS_UNBND) { 4170 error = EPROTO; 4171 goto ud_error2; 4172 } 4173 addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; 4174 addrlen = tudr->DEST_length; 4175 4176 switch (connp->conn_family) { 4177 case AF_INET6: 4178 sin6 = (sin6_t *)addr; 4179 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || 4180 (sin6->sin6_family != AF_INET6)) { 4181 error = EADDRNOTAVAIL; 4182 goto ud_error2; 4183 } 4184 4185 /* No support for mapped addresses on raw sockets */ 4186 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4187 error = EADDRNOTAVAIL; 4188 goto ud_error2; 4189 } 4190 srcid = sin6->__sin6_src_id; 4191 4192 /* 4193 * If the local address is a mapped address return 4194 * an error. 4195 * It would be possible to send an IPv6 packet but the 4196 * response would never make it back to the application 4197 * since it is bound to a mapped address. 4198 */ 4199 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 4200 error = EADDRNOTAVAIL; 4201 goto ud_error2; 4202 } 4203 4204 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 4205 sin6->sin6_addr = ipv6_loopback; 4206 4207 if (tudr->OPT_length != 0) { 4208 /* 4209 * If we are connected then the destination needs to be 4210 * the same as the connected one. 
4211 */ 4212 if (icmp->icmp_state == TS_DATA_XFER && 4213 !conn_same_as_last_v6(connp, sin6)) { 4214 error = EISCONN; 4215 goto ud_error2; 4216 } 4217 error = icmp_output_ancillary(connp, NULL, sin6, 4218 data_mp, mp, NULL, cr, pid); 4219 } else { 4220 ip_xmit_attr_t *ixa; 4221 4222 /* 4223 * We have to allocate an ip_xmit_attr_t before we grab 4224 * conn_lock and we need to hold conn_lock once we've 4225 * checked conn_same_as_last_v6 to handle concurrent 4226 * send* calls on a socket. 4227 */ 4228 ixa = conn_get_ixa(connp, B_FALSE); 4229 if (ixa == NULL) { 4230 error = ENOMEM; 4231 goto ud_error2; 4232 } 4233 mutex_enter(&connp->conn_lock); 4234 4235 if (conn_same_as_last_v6(connp, sin6) && 4236 connp->conn_lastsrcid == srcid && 4237 ipsec_outbound_policy_current(ixa)) { 4238 /* icmp_output_lastdst drops conn_lock */ 4239 error = icmp_output_lastdst(connp, data_mp, cr, 4240 pid, ixa); 4241 } else { 4242 /* icmp_output_newdst drops conn_lock */ 4243 error = icmp_output_newdst(connp, data_mp, NULL, 4244 sin6, cr, pid, ixa); 4245 } 4246 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4247 } 4248 if (error == 0) { 4249 freeb(mp); 4250 return; 4251 } 4252 break; 4253 4254 case AF_INET: 4255 sin = (sin_t *)addr; 4256 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || 4257 (sin->sin_family != AF_INET)) { 4258 error = EADDRNOTAVAIL; 4259 goto ud_error2; 4260 } 4261 if (sin->sin_addr.s_addr == INADDR_ANY) 4262 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 4263 4264 /* Protocol 255 contains full IP headers */ 4265 /* Read without holding lock */ 4266 if (icmp->icmp_hdrincl) { 4267 if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { 4268 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { 4269 error = EINVAL; 4270 goto ud_error2; 4271 } 4272 } 4273 error = icmp_output_hdrincl(connp, data_mp, cr, pid); 4274 if (error == 0) { 4275 freeb(mp); 4276 return; 4277 } 4278 /* data_mp consumed above */ 4279 data_mp = NULL; 4280 goto ud_error2; 4281 } 4282 4283 if (tudr->OPT_length != 
0) { 4284 /* 4285 * If we are connected then the destination needs to be 4286 * the same as the connected one. 4287 */ 4288 if (icmp->icmp_state == TS_DATA_XFER && 4289 !conn_same_as_last_v4(connp, sin)) { 4290 error = EISCONN; 4291 goto ud_error2; 4292 } 4293 error = icmp_output_ancillary(connp, sin, NULL, 4294 data_mp, mp, NULL, cr, pid); 4295 } else { 4296 ip_xmit_attr_t *ixa; 4297 4298 /* 4299 * We have to allocate an ip_xmit_attr_t before we grab 4300 * conn_lock and we need to hold conn_lock once we've 4301 * checked conn_same_as_last_v4 to handle concurrent 4302 * send* calls on a socket. 4303 */ 4304 ixa = conn_get_ixa(connp, B_FALSE); 4305 if (ixa == NULL) { 4306 error = ENOMEM; 4307 goto ud_error2; 4308 } 4309 mutex_enter(&connp->conn_lock); 4310 4311 if (conn_same_as_last_v4(connp, sin) && 4312 ipsec_outbound_policy_current(ixa)) { 4313 /* icmp_output_lastdst drops conn_lock */ 4314 error = icmp_output_lastdst(connp, data_mp, cr, 4315 pid, ixa); 4316 } else { 4317 /* icmp_output_newdst drops conn_lock */ 4318 error = icmp_output_newdst(connp, data_mp, sin, 4319 NULL, cr, pid, ixa); 4320 } 4321 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4322 } 4323 if (error == 0) { 4324 freeb(mp); 4325 return; 4326 } 4327 break; 4328 } 4329 ASSERT(mp != NULL); 4330 /* mp is freed by the following routine */ 4331 icmp_ud_err(q, mp, (t_scalar_t)error); 4332 return; 4333 4334 ud_error2: 4335 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4336 freemsg(data_mp); 4337 ASSERT(mp != NULL); 4338 /* mp is freed by the following routine */ 4339 icmp_ud_err(q, mp, (t_scalar_t)error); 4340 } 4341 4342 /* 4343 * Handle the case of the IP address or flow label being different 4344 * for both IPv4 and IPv6. 4345 * 4346 * NOTE: The caller must hold conn_lock and we drop it here. 
4347 */ 4348 static int 4349 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4350 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4351 { 4352 icmp_t *icmp = connp->conn_icmp; 4353 icmp_stack_t *is = icmp->icmp_is; 4354 int error; 4355 ip_xmit_attr_t *oldixa; 4356 boolean_t do_ipsec; 4357 uint_t srcid; 4358 uint32_t flowinfo; 4359 in6_addr_t v6src; 4360 in6_addr_t v6dst; 4361 in6_addr_t v6nexthop; 4362 in_port_t dstport; 4363 4364 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4365 ASSERT(ixa != NULL); 4366 4367 /* 4368 * We hold conn_lock across all the use and modifications of 4369 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4370 * stay consistent. 4371 */ 4372 4373 ASSERT(cr != NULL); 4374 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4375 ixa->ixa_cred = cr; 4376 ixa->ixa_cpid = pid; 4377 if (is_system_labeled()) { 4378 /* We need to restart with a label based on the cred */ 4379 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4380 } 4381 /* 4382 * If we are connected then the destination needs to be the 4383 * same as the connected one, which is not the case here since we 4384 * checked for that above. 4385 */ 4386 if (icmp->icmp_state == TS_DATA_XFER) { 4387 mutex_exit(&connp->conn_lock); 4388 error = EISCONN; 4389 goto ud_error; 4390 } 4391 4392 /* In case previous destination was multicast or multirt */ 4393 ip_attr_newdst(ixa); 4394 4395 /* 4396 * If laddr is unspecified then we look at sin6_src_id. 4397 * We will give precedence to a source address set with IPV6_PKTINFO 4398 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4399 * want ip_attr_connect to select a source (since it can fail) when 4400 * IPV6_PKTINFO is specified. 4401 * If this doesn't result in a source address then we get a source 4402 * from ip_attr_connect() below. 
4403 */ 4404 v6src = connp->conn_saddr_v6; 4405 if (sin != NULL) { 4406 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4407 dstport = sin->sin_port; 4408 flowinfo = 0; 4409 srcid = 0; 4410 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4411 if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { 4412 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4413 connp->conn_netstack); 4414 } 4415 ixa->ixa_flags |= IXAF_IS_IPV4; 4416 } else { 4417 v6dst = sin6->sin6_addr; 4418 dstport = sin6->sin6_port; 4419 flowinfo = sin6->sin6_flowinfo; 4420 srcid = sin6->__sin6_src_id; 4421 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4422 ixa->ixa_scopeid = sin6->sin6_scope_id; 4423 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4424 } else { 4425 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4426 } 4427 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4428 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4429 connp->conn_netstack); 4430 } 4431 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 4432 ixa->ixa_flags |= IXAF_IS_IPV4; 4433 else 4434 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4435 } 4436 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */ 4437 if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) { 4438 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4439 4440 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4441 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4442 v6src = ipp->ipp_addr; 4443 } else { 4444 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4445 v6src = ipp->ipp_addr; 4446 } 4447 } 4448 4449 /* Defer IPsec if it might need to look at ICMP type/code */ 4450 switch (ixa->ixa_protocol) { 4451 case IPPROTO_ICMP: 4452 case IPPROTO_ICMPV6: 4453 do_ipsec = B_FALSE; 4454 break; 4455 default: 4456 do_ipsec = B_TRUE; 4457 } 4458 4459 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4460 mutex_exit(&connp->conn_lock); 4461 4462 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4463 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4464 (do_ipsec ? 
IPDF_IPSEC : 0)); 4465 switch (error) { 4466 case 0: 4467 break; 4468 case EADDRNOTAVAIL: 4469 /* 4470 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4471 * Don't have the application see that errno 4472 */ 4473 error = ENETUNREACH; 4474 goto failed; 4475 case ENETDOWN: 4476 /* 4477 * Have !ipif_addr_ready address; drop packet silently 4478 * until we can get applications to not send until we 4479 * are ready. 4480 */ 4481 error = 0; 4482 goto failed; 4483 case EHOSTUNREACH: 4484 case ENETUNREACH: 4485 if (ixa->ixa_ire != NULL) { 4486 /* 4487 * Let conn_ip_output/ire_send_noroute return 4488 * the error and send any local ICMP error. 4489 */ 4490 error = 0; 4491 break; 4492 } 4493 /* FALLTHRU */ 4494 default: 4495 failed: 4496 goto ud_error; 4497 } 4498 4499 mutex_enter(&connp->conn_lock); 4500 /* 4501 * While we dropped the lock some other thread might have connected 4502 * this socket. If so we bail out with EISCONN to ensure that the 4503 * connecting thread is the one that updates conn_ixa, conn_ht_* 4504 * and conn_*last*. 4505 */ 4506 if (icmp->icmp_state == TS_DATA_XFER) { 4507 mutex_exit(&connp->conn_lock); 4508 error = EISCONN; 4509 goto ud_error; 4510 } 4511 4512 /* 4513 * We need to rebuild the headers if 4514 * - we are labeling packets (could be different for different 4515 * destinations) 4516 * - we have a source route (or routing header) since we need to 4517 * massage that to get the pseudo-header checksum 4518 * - a socket option with COA_HEADER_CHANGED has been set which 4519 * set conn_v6lastdst to zero. 4520 * 4521 * Otherwise the prepend function will just update the src, dst, 4522 * and flow label. 
4523 */ 4524 if (is_system_labeled()) { 4525 /* TX MLP requires SCM_UCRED and don't have that here */ 4526 if (connp->conn_mlp_type != mlptSingle) { 4527 mutex_exit(&connp->conn_lock); 4528 error = ECONNREFUSED; 4529 goto ud_error; 4530 } 4531 /* 4532 * Check whether Trusted Solaris policy allows communication 4533 * with this host, and pretend that the destination is 4534 * unreachable if not. 4535 * Compute any needed label and place it in ipp_label_v4/v6. 4536 * 4537 * Later conn_build_hdr_template/conn_prepend_hdr takes 4538 * ipp_label_v4/v6 to form the packet. 4539 * 4540 * Tsol note: Since we hold conn_lock we know no other 4541 * thread manipulates conn_xmit_ipp. 4542 */ 4543 error = conn_update_label(connp, ixa, &v6dst, 4544 &connp->conn_xmit_ipp); 4545 if (error != 0) { 4546 mutex_exit(&connp->conn_lock); 4547 goto ud_error; 4548 } 4549 /* Rebuild the header template */ 4550 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4551 flowinfo); 4552 if (error != 0) { 4553 mutex_exit(&connp->conn_lock); 4554 goto ud_error; 4555 } 4556 } else if (connp->conn_xmit_ipp.ipp_fields & 4557 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4558 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4559 /* Rebuild the header template */ 4560 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4561 flowinfo); 4562 if (error != 0) { 4563 mutex_exit(&connp->conn_lock); 4564 goto ud_error; 4565 } 4566 } else { 4567 /* Simply update the destination address if no source route */ 4568 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4569 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4570 4571 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4572 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4573 ipha->ipha_fragment_offset_and_flags |= 4574 IPH_DF_HTONS; 4575 } else { 4576 ipha->ipha_fragment_offset_and_flags &= 4577 ~IPH_DF_HTONS; 4578 } 4579 } else { 4580 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4581 ip6h->ip6_dst = v6dst; 4582 } 4583 } 4584 4585 /* 4586 * Remember the dst etc which corresponds to 
the built header 4587 * template and conn_ixa. 4588 */ 4589 oldixa = conn_replace_ixa(connp, ixa); 4590 connp->conn_v6lastdst = v6dst; 4591 connp->conn_lastflowinfo = flowinfo; 4592 connp->conn_lastscopeid = ixa->ixa_scopeid; 4593 connp->conn_lastsrcid = srcid; 4594 /* Also remember a source to use together with lastdst */ 4595 connp->conn_v6lastsrc = v6src; 4596 4597 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4598 flowinfo, &error); 4599 4600 /* Done with conn_t */ 4601 mutex_exit(&connp->conn_lock); 4602 ixa_refrele(oldixa); 4603 4604 if (data_mp == NULL) { 4605 ASSERT(error != 0); 4606 goto ud_error; 4607 } 4608 4609 if (!do_ipsec) { 4610 /* Policy might differ for different ICMP type/code */ 4611 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4612 if (data_mp == NULL) { 4613 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4614 error = EHOSTUNREACH; /* IPsec policy failure */ 4615 goto done; 4616 } 4617 } 4618 4619 /* We're done. Pass the packet to ip. */ 4620 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4621 4622 error = conn_ip_output(data_mp, ixa); 4623 /* No rawipOutErrors if an error since IP increases its error counter */ 4624 switch (error) { 4625 case 0: 4626 break; 4627 case EWOULDBLOCK: 4628 (void) ixa_check_drain_insert(connp, ixa); 4629 error = 0; 4630 break; 4631 case EADDRNOTAVAIL: 4632 /* 4633 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4634 * Don't have the application see that errno 4635 */ 4636 error = ENETUNREACH; 4637 /* FALLTHRU */ 4638 default: 4639 mutex_enter(&connp->conn_lock); 4640 /* 4641 * Clear the source and v6lastdst so we call ip_attr_connect 4642 * for the next packet and try to pick a better source. 
4643 */ 4644 if (connp->conn_mcbc_bind) 4645 connp->conn_saddr_v6 = ipv6_all_zeros; 4646 else 4647 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4648 connp->conn_v6lastdst = ipv6_all_zeros; 4649 mutex_exit(&connp->conn_lock); 4650 break; 4651 } 4652 done: 4653 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4654 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4655 ixa->ixa_cpid = connp->conn_cpid; 4656 ixa_refrele(ixa); 4657 return (error); 4658 4659 ud_error: 4660 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4661 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4662 ixa->ixa_cpid = connp->conn_cpid; 4663 ixa_refrele(ixa); 4664 4665 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4666 freemsg(data_mp); 4667 return (error); 4668 } 4669 4670 /* ARGSUSED */ 4671 static void 4672 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4673 { 4674 #ifdef DEBUG 4675 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4676 #endif 4677 freemsg(mp); 4678 } 4679 4680 static void 4681 icmp_wput_other(queue_t *q, mblk_t *mp) 4682 { 4683 uchar_t *rptr = mp->b_rptr; 4684 struct iocblk *iocp; 4685 conn_t *connp = Q_TO_CONN(q); 4686 icmp_t *icmp = connp->conn_icmp; 4687 cred_t *cr; 4688 4689 switch (mp->b_datap->db_type) { 4690 case M_PROTO: 4691 case M_PCPROTO: 4692 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4693 /* 4694 * If the message does not contain a PRIM_type, 4695 * throw it away. 4696 */ 4697 freemsg(mp); 4698 return; 4699 } 4700 switch (((t_primp_t)rptr)->type) { 4701 case T_ADDR_REQ: 4702 icmp_addr_req(q, mp); 4703 return; 4704 case O_T_BIND_REQ: 4705 case T_BIND_REQ: 4706 icmp_tpi_bind(q, mp); 4707 return; 4708 case T_CONN_REQ: 4709 icmp_tpi_connect(q, mp); 4710 return; 4711 case T_CAPABILITY_REQ: 4712 icmp_capability_req(q, mp); 4713 return; 4714 case T_INFO_REQ: 4715 icmp_info_req(q, mp); 4716 return; 4717 case T_UNITDATA_REQ: 4718 /* 4719 * If a T_UNITDATA_REQ gets here, the address must 4720 * be bad. 
Valid T_UNITDATA_REQs are handled 4721 * in icmp_wput. 4722 */ 4723 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4724 return; 4725 case T_UNBIND_REQ: 4726 icmp_tpi_unbind(q, mp); 4727 return; 4728 case T_SVR4_OPTMGMT_REQ: 4729 /* 4730 * All Solaris components should pass a db_credp 4731 * for this TPI message, hence we ASSERT. 4732 * But in case there is some other M_PROTO that looks 4733 * like a TPI message sent by some other kernel 4734 * component, we check and return an error. 4735 */ 4736 cr = msg_getcred(mp, NULL); 4737 ASSERT(cr != NULL); 4738 if (cr == NULL) { 4739 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4740 return; 4741 } 4742 4743 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4744 cr)) { 4745 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4746 } 4747 return; 4748 4749 case T_OPTMGMT_REQ: 4750 /* 4751 * All Solaris components should pass a db_credp 4752 * for this TPI message, hence we ASSERT. 4753 * But in case there is some other M_PROTO that looks 4754 * like a TPI message sent by some other kernel 4755 * component, we check and return an error. 4756 */ 4757 cr = msg_getcred(mp, NULL); 4758 ASSERT(cr != NULL); 4759 if (cr == NULL) { 4760 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4761 return; 4762 } 4763 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4764 return; 4765 4766 case T_DISCON_REQ: 4767 icmp_tpi_disconnect(q, mp); 4768 return; 4769 4770 /* The following TPI message is not supported by icmp. */ 4771 case O_T_CONN_RES: 4772 case T_CONN_RES: 4773 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4774 return; 4775 4776 /* The following 3 TPI requests are illegal for icmp. 
*/ 4777 case T_DATA_REQ: 4778 case T_EXDATA_REQ: 4779 case T_ORDREL_REQ: 4780 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4781 return; 4782 default: 4783 break; 4784 } 4785 break; 4786 case M_FLUSH: 4787 if (*rptr & FLUSHW) 4788 flushq(q, FLUSHDATA); 4789 break; 4790 case M_IOCTL: 4791 iocp = (struct iocblk *)mp->b_rptr; 4792 switch (iocp->ioc_cmd) { 4793 case TI_GETPEERNAME: 4794 if (icmp->icmp_state != TS_DATA_XFER) { 4795 /* 4796 * If a default destination address has not 4797 * been associated with the stream, then we 4798 * don't know the peer's name. 4799 */ 4800 iocp->ioc_error = ENOTCONN; 4801 iocp->ioc_count = 0; 4802 mp->b_datap->db_type = M_IOCACK; 4803 qreply(q, mp); 4804 return; 4805 } 4806 /* FALLTHRU */ 4807 case TI_GETMYNAME: 4808 /* 4809 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4810 * need to copyin the user's strbuf structure. 4811 * Processing will continue in the M_IOCDATA case 4812 * below. 4813 */ 4814 mi_copyin(q, mp, NULL, 4815 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4816 return; 4817 default: 4818 break; 4819 } 4820 break; 4821 case M_IOCDATA: 4822 icmp_wput_iocdata(q, mp); 4823 return; 4824 default: 4825 /* Unrecognized messages are passed through without change. */ 4826 break; 4827 } 4828 ip_wput_nondata(q, mp); 4829 } 4830 4831 /* 4832 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4833 * messages. 4834 */ 4835 static void 4836 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4837 { 4838 mblk_t *mp1; 4839 STRUCT_HANDLE(strbuf, sb); 4840 uint_t addrlen; 4841 conn_t *connp = Q_TO_CONN(q); 4842 icmp_t *icmp = connp->conn_icmp; 4843 4844 /* Make sure it is one of ours. 
*/ 4845 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4846 case TI_GETMYNAME: 4847 case TI_GETPEERNAME: 4848 break; 4849 default: 4850 ip_wput_nondata(q, mp); 4851 return; 4852 } 4853 4854 switch (mi_copy_state(q, mp, &mp1)) { 4855 case -1: 4856 return; 4857 case MI_COPY_CASE(MI_COPY_IN, 1): 4858 break; 4859 case MI_COPY_CASE(MI_COPY_OUT, 1): 4860 /* 4861 * The address has been copied out, so now 4862 * copyout the strbuf. 4863 */ 4864 mi_copyout(q, mp); 4865 return; 4866 case MI_COPY_CASE(MI_COPY_OUT, 2): 4867 /* 4868 * The address and strbuf have been copied out. 4869 * We're done, so just acknowledge the original 4870 * M_IOCTL. 4871 */ 4872 mi_copy_done(q, mp, 0); 4873 return; 4874 default: 4875 /* 4876 * Something strange has happened, so acknowledge 4877 * the original M_IOCTL with an EPROTO error. 4878 */ 4879 mi_copy_done(q, mp, EPROTO); 4880 return; 4881 } 4882 4883 /* 4884 * Now we have the strbuf structure for TI_GETMYNAME 4885 * and TI_GETPEERNAME. Next we copyout the requested 4886 * address and then we'll copyout the strbuf. 
4887 */ 4888 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4889 (void *)mp1->b_rptr); 4890 4891 if (connp->conn_family == AF_INET) 4892 addrlen = sizeof (sin_t); 4893 else 4894 addrlen = sizeof (sin6_t); 4895 4896 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4897 mi_copy_done(q, mp, EINVAL); 4898 return; 4899 } 4900 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4901 case TI_GETMYNAME: 4902 break; 4903 case TI_GETPEERNAME: 4904 if (icmp->icmp_state != TS_DATA_XFER) { 4905 mi_copy_done(q, mp, ENOTCONN); 4906 return; 4907 } 4908 break; 4909 default: 4910 mi_copy_done(q, mp, EPROTO); 4911 return; 4912 } 4913 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4914 if (!mp1) 4915 return; 4916 4917 STRUCT_FSET(sb, len, addrlen); 4918 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4919 case TI_GETMYNAME: 4920 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4921 &addrlen); 4922 break; 4923 case TI_GETPEERNAME: 4924 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4925 &addrlen); 4926 break; 4927 } 4928 mp1->b_wptr += addrlen; 4929 /* Copy out the address */ 4930 mi_copyout(q, mp); 4931 } 4932 4933 void 4934 icmp_ddi_g_init(void) 4935 { 4936 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4937 icmp_opt_obj.odb_opt_arr_cnt); 4938 4939 /* 4940 * We want to be informed each time a stack is created or 4941 * destroyed in the kernel, so we can maintain the 4942 * set of icmp_stack_t's. 4943 */ 4944 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4945 } 4946 4947 void 4948 icmp_ddi_g_destroy(void) 4949 { 4950 netstack_unregister(NS_ICMP); 4951 } 4952 4953 #define INET_NAME "ip" 4954 4955 /* 4956 * Initialize the ICMP stack instance. 
4957 */ 4958 static void * 4959 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 4960 { 4961 icmp_stack_t *is; 4962 int error = 0; 4963 size_t arrsz; 4964 major_t major; 4965 4966 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 4967 is->is_netstack = ns; 4968 4969 arrsz = sizeof (icmp_propinfo_tbl); 4970 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 4971 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 4972 4973 is->is_ksp = rawip_kstat_init(stackid); 4974 4975 major = mod_name_to_major(INET_NAME); 4976 error = ldi_ident_from_major(major, &is->is_ldi_ident); 4977 ASSERT(error == 0); 4978 return (is); 4979 } 4980 4981 /* 4982 * Free the ICMP stack instance. 4983 */ 4984 static void 4985 rawip_stack_fini(netstackid_t stackid, void *arg) 4986 { 4987 icmp_stack_t *is = (icmp_stack_t *)arg; 4988 4989 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 4990 is->is_propinfo_tbl = NULL; 4991 4992 rawip_kstat_fini(stackid, is->is_ksp); 4993 is->is_ksp = NULL; 4994 ldi_ident_release(is->is_ldi_ident); 4995 kmem_free(is, sizeof (*is)); 4996 } 4997 4998 static void * 4999 rawip_kstat_init(netstackid_t stackid) { 5000 kstat_t *ksp; 5001 5002 rawip_named_kstat_t template = { 5003 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5004 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5005 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5006 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5007 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5008 }; 5009 5010 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5011 KSTAT_TYPE_NAMED, 5012 NUM_OF_FIELDS(rawip_named_kstat_t), 5013 0, stackid); 5014 if (ksp == NULL || ksp->ks_data == NULL) 5015 return (NULL); 5016 5017 bcopy(&template, ksp->ks_data, sizeof (template)); 5018 ksp->ks_update = rawip_kstat_update; 5019 ksp->ks_private = (void *)(uintptr_t)stackid; 5020 5021 kstat_install(ksp); 5022 return (ksp); 5023 } 5024 5025 static void 5026 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5027 { 5028 if (ksp != NULL) 
{ 5029 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5030 kstat_delete_netstack(ksp, stackid); 5031 } 5032 } 5033 5034 static int 5035 rawip_kstat_update(kstat_t *ksp, int rw) 5036 { 5037 rawip_named_kstat_t *rawipkp; 5038 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5039 netstack_t *ns; 5040 icmp_stack_t *is; 5041 5042 if ((ksp == NULL) || (ksp->ks_data == NULL)) 5043 return (EIO); 5044 5045 if (rw == KSTAT_WRITE) 5046 return (EACCES); 5047 5048 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5049 5050 ns = netstack_find_by_stackid(stackid); 5051 if (ns == NULL) 5052 return (-1); 5053 is = ns->netstack_icmp; 5054 if (is == NULL) { 5055 netstack_rele(ns); 5056 return (-1); 5057 } 5058 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5059 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5060 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5061 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5062 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5063 netstack_rele(ns); 5064 return (0); 5065 } 5066 5067 /* ARGSUSED */ 5068 int 5069 rawip_accept(sock_lower_handle_t lproto_handle, 5070 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5071 cred_t *cr) 5072 { 5073 return (EOPNOTSUPP); 5074 } 5075 5076 /* ARGSUSED */ 5077 int 5078 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5079 socklen_t len, cred_t *cr) 5080 { 5081 conn_t *connp = (conn_t *)proto_handle; 5082 int error; 5083 5084 /* All Solaris components should pass a cred for this operation. 
*/ 5085 ASSERT(cr != NULL); 5086 5087 /* Binding to a NULL address really means unbind */ 5088 if (sa == NULL) 5089 error = rawip_do_unbind(connp); 5090 else 5091 error = rawip_do_bind(connp, sa, len); 5092 5093 if (error < 0) { 5094 if (error == -TOUTSTATE) 5095 error = EINVAL; 5096 else 5097 error = proto_tlitosyserr(-error); 5098 } 5099 return (error); 5100 } 5101 5102 static int 5103 rawip_implicit_bind(conn_t *connp) 5104 { 5105 sin6_t sin6addr; 5106 sin_t *sin; 5107 sin6_t *sin6; 5108 socklen_t len; 5109 int error; 5110 5111 if (connp->conn_family == AF_INET) { 5112 len = sizeof (struct sockaddr_in); 5113 sin = (sin_t *)&sin6addr; 5114 *sin = sin_null; 5115 sin->sin_family = AF_INET; 5116 sin->sin_addr.s_addr = INADDR_ANY; 5117 } else { 5118 ASSERT(connp->conn_family == AF_INET6); 5119 len = sizeof (sin6_t); 5120 sin6 = (sin6_t *)&sin6addr; 5121 *sin6 = sin6_null; 5122 sin6->sin6_family = AF_INET6; 5123 V6_SET_ZERO(sin6->sin6_addr); 5124 } 5125 5126 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5127 5128 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5129 } 5130 5131 static int 5132 rawip_unbind(conn_t *connp) 5133 { 5134 int error; 5135 5136 error = rawip_do_unbind(connp); 5137 if (error < 0) { 5138 error = proto_tlitosyserr(-error); 5139 } 5140 return (error); 5141 } 5142 5143 /* ARGSUSED */ 5144 int 5145 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5146 { 5147 return (EOPNOTSUPP); 5148 } 5149 5150 int 5151 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5152 socklen_t len, sock_connid_t *id, cred_t *cr) 5153 { 5154 conn_t *connp = (conn_t *)proto_handle; 5155 icmp_t *icmp = connp->conn_icmp; 5156 int error; 5157 boolean_t did_bind = B_FALSE; 5158 pid_t pid = curproc->p_pid; 5159 5160 /* All Solaris components should pass a cred for this operation. 
*/ 5161 ASSERT(cr != NULL); 5162 5163 if (sa == NULL) { 5164 /* 5165 * Disconnect 5166 * Make sure we are connected 5167 */ 5168 if (icmp->icmp_state != TS_DATA_XFER) 5169 return (EINVAL); 5170 5171 error = icmp_disconnect(connp); 5172 return (error); 5173 } 5174 5175 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5176 if (error != 0) 5177 return (error); 5178 5179 /* do an implicit bind if necessary */ 5180 if (icmp->icmp_state == TS_UNBND) { 5181 error = rawip_implicit_bind(connp); 5182 /* 5183 * We could be racing with an actual bind, in which case 5184 * we would see EPROTO. We cross our fingers and try 5185 * to connect. 5186 */ 5187 if (!(error == 0 || error == EPROTO)) 5188 return (error); 5189 did_bind = B_TRUE; 5190 } 5191 5192 /* 5193 * set SO_DGRAM_ERRIND 5194 */ 5195 connp->conn_dgram_errind = B_TRUE; 5196 5197 error = rawip_do_connect(connp, sa, len, cr, pid); 5198 if (error != 0 && did_bind) { 5199 int unbind_err; 5200 5201 unbind_err = rawip_unbind(connp); 5202 ASSERT(unbind_err == 0); 5203 } 5204 5205 if (error == 0) { 5206 *id = 0; 5207 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5208 0, NULL, -1); 5209 } else if (error < 0) { 5210 error = proto_tlitosyserr(-error); 5211 } 5212 return (error); 5213 } 5214 5215 /* ARGSUSED2 */ 5216 int 5217 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5218 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 5219 sock_quiesce_arg_t *arg) 5220 { 5221 conn_t *connp = (conn_t *)proto_handle; 5222 icmp_t *icmp; 5223 struct T_capability_ack tca; 5224 struct sockaddr_in6 laddr, faddr; 5225 socklen_t laddrlen, faddrlen; 5226 short opts; 5227 struct stroptions *stropt; 5228 mblk_t *mp, *stropt_mp; 5229 int error; 5230 5231 icmp = connp->conn_icmp; 5232 5233 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5234 5235 /* 5236 * setup the fallback stream that was allocated 5237 */ 5238 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5239 connp->conn_minor_arena = 
WR(q)->q_ptr; 5240 5241 RD(q)->q_ptr = WR(q)->q_ptr = connp; 5242 5243 WR(q)->q_qinfo = &icmpwinit; 5244 5245 connp->conn_rq = RD(q); 5246 connp->conn_wq = WR(q); 5247 5248 /* Notify stream head about options before sending up data */ 5249 stropt_mp->b_datap->db_type = M_SETOPTS; 5250 stropt_mp->b_wptr += sizeof (*stropt); 5251 stropt = (struct stroptions *)stropt_mp->b_rptr; 5252 stropt->so_flags = SO_WROFF | SO_HIWAT; 5253 stropt->so_wroff = connp->conn_wroff; 5254 stropt->so_hiwat = connp->conn_rcvbuf; 5255 putnext(RD(q), stropt_mp); 5256 5257 /* 5258 * free helper stream 5259 */ 5260 ip_free_helper_stream(connp); 5261 5262 /* 5263 * Collect the information needed to sync with the sonode 5264 */ 5265 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5266 5267 laddrlen = faddrlen = sizeof (sin6_t); 5268 (void) rawip_getsockname((sock_lower_handle_t)connp, 5269 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5270 error = rawip_getpeername((sock_lower_handle_t)connp, 5271 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5272 if (error != 0) 5273 faddrlen = 0; 5274 opts = 0; 5275 if (connp->conn_dgram_errind) 5276 opts |= SO_DGRAM_ERRIND; 5277 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5278 opts |= SO_DONTROUTE; 5279 5280 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 5281 (struct sockaddr *)&laddr, laddrlen, 5282 (struct sockaddr *)&faddr, faddrlen, opts); 5283 5284 /* 5285 * Attempts to send data up during fallback will result in it being 5286 * queued in icmp_t. Now we push up any queued packets. 
5287 */ 5288 mutex_enter(&icmp->icmp_recv_lock); 5289 if (mp != NULL) { 5290 mp->b_next = icmp->icmp_fallback_queue_head; 5291 icmp->icmp_fallback_queue_head = mp; 5292 } 5293 while (icmp->icmp_fallback_queue_head != NULL) { 5294 mp = icmp->icmp_fallback_queue_head; 5295 icmp->icmp_fallback_queue_head = mp->b_next; 5296 mp->b_next = NULL; 5297 mutex_exit(&icmp->icmp_recv_lock); 5298 putnext(RD(q), mp); 5299 mutex_enter(&icmp->icmp_recv_lock); 5300 } 5301 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5302 5303 /* 5304 * No longer a streams less socket 5305 */ 5306 mutex_enter(&connp->conn_lock); 5307 connp->conn_flags &= ~IPCL_NONSTR; 5308 mutex_exit(&connp->conn_lock); 5309 5310 mutex_exit(&icmp->icmp_recv_lock); 5311 5312 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5313 icmp->icmp_fallback_queue_tail == NULL); 5314 5315 ASSERT(connp->conn_ref >= 1); 5316 5317 return (0); 5318 } 5319 5320 /* ARGSUSED2 */ 5321 sock_lower_handle_t 5322 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5323 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5324 { 5325 conn_t *connp; 5326 5327 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5328 *errorp = EPROTONOSUPPORT; 5329 return (NULL); 5330 } 5331 5332 connp = rawip_do_open(family, credp, errorp, flags); 5333 if (connp != NULL) { 5334 connp->conn_flags |= IPCL_NONSTR; 5335 5336 mutex_enter(&connp->conn_lock); 5337 connp->conn_state_flags &= ~CONN_INCIPIENT; 5338 mutex_exit(&connp->conn_lock); 5339 *sock_downcalls = &sock_rawip_downcalls; 5340 *smodep = SM_ATOMIC; 5341 } else { 5342 ASSERT(*errorp != 0); 5343 } 5344 5345 return ((sock_lower_handle_t)connp); 5346 } 5347 5348 /* ARGSUSED3 */ 5349 void 5350 rawip_activate(sock_lower_handle_t proto_handle, 5351 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5352 cred_t *cr) 5353 { 5354 conn_t *connp = (conn_t *)proto_handle; 5355 struct sock_proto_props sopp; 5356 5357 /* All 
Solaris components should pass a cred for this operation. */ 5358 ASSERT(cr != NULL); 5359 5360 connp->conn_upcalls = sock_upcalls; 5361 connp->conn_upper_handle = sock_handle; 5362 5363 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5364 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5365 sopp.sopp_wroff = connp->conn_wroff; 5366 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5367 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5368 sopp.sopp_maxblk = INFPSZ; 5369 sopp.sopp_maxpsz = IP_MAXPACKET; 5370 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5371 icmp_mod_info.mi_minpsz; 5372 5373 (*connp->conn_upcalls->su_set_proto_props) 5374 (connp->conn_upper_handle, &sopp); 5375 5376 icmp_bind_proto(connp->conn_icmp); 5377 } 5378 5379 /* ARGSUSED3 */ 5380 int 5381 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5382 socklen_t *salenp, cred_t *cr) 5383 { 5384 conn_t *connp = (conn_t *)proto_handle; 5385 icmp_t *icmp = connp->conn_icmp; 5386 int error; 5387 5388 /* All Solaris components should pass a cred for this operation. */ 5389 ASSERT(cr != NULL); 5390 5391 mutex_enter(&connp->conn_lock); 5392 if (icmp->icmp_state != TS_DATA_XFER) 5393 error = ENOTCONN; 5394 else 5395 error = conn_getpeername(connp, sa, salenp); 5396 mutex_exit(&connp->conn_lock); 5397 return (error); 5398 } 5399 5400 /* ARGSUSED3 */ 5401 int 5402 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5403 socklen_t *salenp, cred_t *cr) 5404 { 5405 conn_t *connp = (conn_t *)proto_handle; 5406 int error; 5407 5408 /* All Solaris components should pass a cred for this operation. 
*/ 5409 ASSERT(cr != NULL); 5410 5411 mutex_enter(&connp->conn_lock); 5412 error = conn_getsockname(connp, sa, salenp); 5413 mutex_exit(&connp->conn_lock); 5414 return (error); 5415 } 5416 5417 int 5418 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5419 const void *optvalp, socklen_t optlen, cred_t *cr) 5420 { 5421 conn_t *connp = (conn_t *)proto_handle; 5422 int error; 5423 5424 /* All Solaris components should pass a cred for this operation. */ 5425 ASSERT(cr != NULL); 5426 5427 error = proto_opt_check(level, option_name, optlen, NULL, 5428 icmp_opt_obj.odb_opt_des_arr, 5429 icmp_opt_obj.odb_opt_arr_cnt, 5430 B_TRUE, B_FALSE, cr); 5431 5432 if (error != 0) { 5433 /* 5434 * option not recognized 5435 */ 5436 if (error < 0) { 5437 error = proto_tlitosyserr(-error); 5438 } 5439 return (error); 5440 } 5441 5442 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5443 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5444 (uchar_t *)optvalp, NULL, cr); 5445 5446 ASSERT(error >= 0); 5447 5448 return (error); 5449 } 5450 5451 int 5452 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5453 void *optvalp, socklen_t *optlen, cred_t *cr) 5454 { 5455 int error; 5456 conn_t *connp = (conn_t *)proto_handle; 5457 t_uscalar_t max_optbuf_len; 5458 void *optvalp_buf; 5459 int len; 5460 5461 /* All Solaris components should pass a cred for this operation. 
*/ 5462 ASSERT(cr != NULL); 5463 5464 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5465 icmp_opt_obj.odb_opt_des_arr, 5466 icmp_opt_obj.odb_opt_arr_cnt, 5467 B_FALSE, B_TRUE, cr); 5468 5469 if (error != 0) { 5470 if (error < 0) { 5471 error = proto_tlitosyserr(-error); 5472 } 5473 return (error); 5474 } 5475 5476 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5477 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5478 if (len == -1) { 5479 kmem_free(optvalp_buf, max_optbuf_len); 5480 return (EINVAL); 5481 } 5482 5483 /* 5484 * update optlen and copy option value 5485 */ 5486 t_uscalar_t size = MIN(len, *optlen); 5487 5488 bcopy(optvalp_buf, optvalp, size); 5489 bcopy(&size, optlen, sizeof (size)); 5490 5491 kmem_free(optvalp_buf, max_optbuf_len); 5492 return (0); 5493 } 5494 5495 /* ARGSUSED1 */ 5496 int 5497 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5498 { 5499 conn_t *connp = (conn_t *)proto_handle; 5500 5501 /* All Solaris components should pass a cred for this operation. */ 5502 ASSERT(cr != NULL); 5503 5504 (void) rawip_do_close(connp); 5505 return (0); 5506 } 5507 5508 /* ARGSUSED2 */ 5509 int 5510 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5511 { 5512 conn_t *connp = (conn_t *)proto_handle; 5513 5514 /* All Solaris components should pass a cred for this operation. 
*/ 5515 ASSERT(cr != NULL); 5516 5517 /* shut down the send side */ 5518 if (how != SHUT_RD) 5519 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5520 SOCK_OPCTL_SHUT_SEND, 0); 5521 /* shut down the recv side */ 5522 if (how != SHUT_WR) 5523 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5524 SOCK_OPCTL_SHUT_RECV, 0); 5525 return (0); 5526 } 5527 5528 void 5529 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5530 { 5531 conn_t *connp = (conn_t *)proto_handle; 5532 icmp_t *icmp = connp->conn_icmp; 5533 5534 mutex_enter(&icmp->icmp_recv_lock); 5535 connp->conn_flow_cntrld = B_FALSE; 5536 mutex_exit(&icmp->icmp_recv_lock); 5537 } 5538 5539 int 5540 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5541 int mode, int32_t *rvalp, cred_t *cr) 5542 { 5543 conn_t *connp = (conn_t *)proto_handle; 5544 int error; 5545 5546 /* All Solaris components should pass a cred for this operation. */ 5547 ASSERT(cr != NULL); 5548 5549 /* 5550 * If we don't have a helper stream then create one. 5551 * ip_create_helper_stream takes care of locking the conn_t, 5552 * so this check for NULL is just a performance optimization. 5553 */ 5554 if (connp->conn_helper_info == NULL) { 5555 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5556 5557 ASSERT(is->is_ldi_ident != NULL); 5558 5559 /* 5560 * Create a helper stream for non-STREAMS socket. 
5561 */ 5562 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5563 if (error != 0) { 5564 ip0dbg(("rawip_ioctl: create of IP helper stream " 5565 "failed %d\n", error)); 5566 return (error); 5567 } 5568 } 5569 5570 switch (cmd) { 5571 case _SIOCSOCKFALLBACK: 5572 case TI_GETPEERNAME: 5573 case TI_GETMYNAME: 5574 #ifdef DEBUG 5575 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5576 " socket", cmd); 5577 #endif 5578 error = EINVAL; 5579 break; 5580 default: 5581 /* 5582 * Pass on to IP using helper stream 5583 */ 5584 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5585 cmd, arg, mode, cr, rvalp); 5586 break; 5587 } 5588 return (error); 5589 } 5590 5591 int 5592 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5593 cred_t *cr) 5594 { 5595 sin6_t *sin6; 5596 sin_t *sin = NULL; 5597 uint_t srcid; 5598 conn_t *connp = (conn_t *)proto_handle; 5599 icmp_t *icmp = connp->conn_icmp; 5600 int error = 0; 5601 icmp_stack_t *is = icmp->icmp_is; 5602 pid_t pid = curproc->p_pid; 5603 ip_xmit_attr_t *ixa; 5604 5605 ASSERT(DB_TYPE(mp) == M_DATA); 5606 5607 /* All Solaris components should pass a cred for this operation. */ 5608 ASSERT(cr != NULL); 5609 5610 /* do an implicit bind if necessary */ 5611 if (icmp->icmp_state == TS_UNBND) { 5612 error = rawip_implicit_bind(connp); 5613 /* 5614 * We could be racing with an actual bind, in which case 5615 * we would see EPROTO. We cross our fingers and try 5616 * to connect. 
5617 */ 5618 if (!(error == 0 || error == EPROTO)) { 5619 freemsg(mp); 5620 return (error); 5621 } 5622 } 5623 5624 /* Protocol 255 contains full IP headers */ 5625 /* Read without holding lock */ 5626 if (icmp->icmp_hdrincl) { 5627 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5628 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5629 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5630 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5631 freemsg(mp); 5632 return (EINVAL); 5633 } 5634 } 5635 error = icmp_output_hdrincl(connp, mp, cr, pid); 5636 if (is->is_sendto_ignerr) 5637 return (0); 5638 else 5639 return (error); 5640 } 5641 5642 /* Connected? */ 5643 if (msg->msg_name == NULL) { 5644 if (icmp->icmp_state != TS_DATA_XFER) { 5645 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5646 return (EDESTADDRREQ); 5647 } 5648 if (msg->msg_controllen != 0) { 5649 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5650 NULL, msg, cr, pid); 5651 } else { 5652 error = icmp_output_connected(connp, mp, cr, pid); 5653 } 5654 if (is->is_sendto_ignerr) 5655 return (0); 5656 else 5657 return (error); 5658 } 5659 if (icmp->icmp_state == TS_DATA_XFER) { 5660 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5661 return (EISCONN); 5662 } 5663 error = proto_verify_ip_addr(connp->conn_family, 5664 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5665 if (error != 0) { 5666 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5667 return (error); 5668 } 5669 switch (connp->conn_family) { 5670 case AF_INET6: 5671 sin6 = (sin6_t *)msg->msg_name; 5672 5673 /* No support for mapped addresses on raw sockets */ 5674 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5675 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5676 return (EADDRNOTAVAIL); 5677 } 5678 srcid = sin6->__sin6_src_id; 5679 5680 /* 5681 * If the local address is a mapped address return 5682 * an error. 
5683 * It would be possible to send an IPv6 packet but the 5684 * response would never make it back to the application 5685 * since it is bound to a mapped address. 5686 */ 5687 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5688 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5689 return (EADDRNOTAVAIL); 5690 } 5691 5692 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5693 sin6->sin6_addr = ipv6_loopback; 5694 5695 /* 5696 * We have to allocate an ip_xmit_attr_t before we grab 5697 * conn_lock and we need to hold conn_lock once we've check 5698 * conn_same_as_last_v6 to handle concurrent send* calls on a 5699 * socket. 5700 */ 5701 if (msg->msg_controllen == 0) { 5702 ixa = conn_get_ixa(connp, B_FALSE); 5703 if (ixa == NULL) { 5704 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5705 return (ENOMEM); 5706 } 5707 } else { 5708 ixa = NULL; 5709 } 5710 mutex_enter(&connp->conn_lock); 5711 if (icmp->icmp_delayed_error != 0) { 5712 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5713 5714 error = icmp->icmp_delayed_error; 5715 icmp->icmp_delayed_error = 0; 5716 5717 /* Compare IP address and family */ 5718 5719 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5720 &sin2->sin6_addr) && 5721 sin6->sin6_family == sin2->sin6_family) { 5722 mutex_exit(&connp->conn_lock); 5723 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5724 if (ixa != NULL) 5725 ixa_refrele(ixa); 5726 return (error); 5727 } 5728 } 5729 if (msg->msg_controllen != 0) { 5730 mutex_exit(&connp->conn_lock); 5731 ASSERT(ixa == NULL); 5732 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5733 NULL, msg, cr, pid); 5734 } else if (conn_same_as_last_v6(connp, sin6) && 5735 connp->conn_lastsrcid == srcid && 5736 ipsec_outbound_policy_current(ixa)) { 5737 /* icmp_output_lastdst drops conn_lock */ 5738 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5739 } else { 5740 /* icmp_output_newdst drops conn_lock */ 5741 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5742 pid, ixa); 5743 } 5744 
ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5745 if (is->is_sendto_ignerr) 5746 return (0); 5747 else 5748 return (error); 5749 case AF_INET: 5750 sin = (sin_t *)msg->msg_name; 5751 5752 if (sin->sin_addr.s_addr == INADDR_ANY) 5753 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5754 5755 /* 5756 * We have to allocate an ip_xmit_attr_t before we grab 5757 * conn_lock and we need to hold conn_lock once we've check 5758 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5759 */ 5760 if (msg->msg_controllen == 0) { 5761 ixa = conn_get_ixa(connp, B_FALSE); 5762 if (ixa == NULL) { 5763 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5764 return (ENOMEM); 5765 } 5766 } else { 5767 ixa = NULL; 5768 } 5769 mutex_enter(&connp->conn_lock); 5770 if (icmp->icmp_delayed_error != 0) { 5771 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5772 5773 error = icmp->icmp_delayed_error; 5774 icmp->icmp_delayed_error = 0; 5775 5776 /* Compare IP address */ 5777 5778 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5779 mutex_exit(&connp->conn_lock); 5780 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5781 if (ixa != NULL) 5782 ixa_refrele(ixa); 5783 return (error); 5784 } 5785 } 5786 5787 if (msg->msg_controllen != 0) { 5788 mutex_exit(&connp->conn_lock); 5789 ASSERT(ixa == NULL); 5790 error = icmp_output_ancillary(connp, sin, NULL, mp, 5791 NULL, msg, cr, pid); 5792 } else if (conn_same_as_last_v4(connp, sin) && 5793 ipsec_outbound_policy_current(ixa)) { 5794 /* icmp_output_lastdst drops conn_lock */ 5795 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5796 } else { 5797 /* icmp_output_newdst drops conn_lock */ 5798 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5799 pid, ixa); 5800 } 5801 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5802 if (is->is_sendto_ignerr) 5803 return (0); 5804 else 5805 return (error); 5806 default: 5807 return (EINVAL); 5808 } 5809 } 5810 5811 sock_downcalls_t sock_rawip_downcalls = { 5812 rawip_activate, 5813 rawip_accept, 5814 
rawip_bind, 5815 rawip_listen, 5816 rawip_connect, 5817 rawip_getpeername, 5818 rawip_getsockname, 5819 rawip_getsockopt, 5820 rawip_setsockopt, 5821 rawip_send, 5822 NULL, 5823 NULL, 5824 NULL, 5825 rawip_shutdown, 5826 rawip_clr_flowctrl, 5827 rawip_ioctl, 5828 rawip_close 5829 }; 5830