1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* Copyright (c) 1990 Mentat Inc. */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #include <sys/stropts.h> 29 #include <sys/strlog.h> 30 #include <sys/strsun.h> 31 #define _SUN_TPI_VERSION 2 32 #include <sys/tihdr.h> 33 #include <sys/timod.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/strsubr.h> 37 #include <sys/suntpi.h> 38 #include <sys/xti_inet.h> 39 #include <sys/cmn_err.h> 40 #include <sys/kmem.h> 41 #include <sys/cred.h> 42 #include <sys/policy.h> 43 #include <sys/priv.h> 44 #include <sys/ucred.h> 45 #include <sys/zone.h> 46 47 #include <sys/sockio.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/vtrace.h> 51 #include <sys/sdt.h> 52 #include <sys/debug.h> 53 #include <sys/isa_defs.h> 54 #include <sys/random.h> 55 #include <netinet/in.h> 56 #include <netinet/ip6.h> 57 #include <netinet/icmp6.h> 58 #include <netinet/udp.h> 59 60 #include <inet/common.h> 61 #include <inet/ip.h> 62 #include <inet/ip_impl.h> 63 #include <inet/ipsec_impl.h> 64 #include <inet/ip6.h> 65 
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
#include <inet/ip_multi.h>
#include <inet/ip_ndp.h>
#include <inet/proto_set.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/ipclassifier.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h>

#include <sys/disp.h>

/*
 * Synchronization notes:
 *
 * RAWIP is MT and uses the usual kernel synchronization primitives. We use
 * conn_lock to protect the icmp_t.
 *
 * Plumbing notes:
 * ICMP is always a device driver. For compatibility with mibopen() code
 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
 * dummy module.
 */

static void	icmp_addr_req(queue_t *q, mblk_t *mp);
static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void	icmp_bind_proto(icmp_t *icmp);
static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void	icmp_capability_req(queue_t *q, mblk_t *mp);
static int	icmp_close(queue_t *q, int flags);
static void	icmp_close_free(conn_t *);
static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
/* Parameter names match the definition of icmp_err_ack_prim(). */
static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t t_error, int sys_error);
static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void	icmp_info_req(queue_t *q, mblk_t *mp);
static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t	*icmp_open(int family, cred_t *credp, int *err, int flags);
static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
int		icmp_opt_set(conn_t *connp, uint_t optset_context,
    int level, int name, uint_t inlen,
    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr);
int		icmp_opt_get(conn_t *connp, int level, int name,
    uchar_t *ptr);
static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr, int len);
static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static void	icmp_wput(queue_t *q, mblk_t *mp);
static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void	icmp_wput_other(queue_t *q, mblk_t *mp);
static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rawip_stack_fini(netstackid_t stackid, void *arg);

static void	*rawip_kstat_init(netstackid_t stackid);
static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int	rawip_kstat_update(kstat_t *kp, int rw);
static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t	*rawip_do_open(int, cred_t *, int *, int);
static void	rawip_do_close(conn_t *);
static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int	rawip_do_unbind(conn_t *);
static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);
int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);

/* STREAMS module information shared by every queue of this driver. */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};

/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/* ICMP entry point during fallback */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/* For AF_INET aka /dev/icmp */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/* For AF_INET6 aka /dev/icmp6 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};

/* Default structure copied into T_INFO_ACK messages */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};

/*
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Note: All those tunables which do not start with "_" are Committed and
 * therefore are public. See PSARC 2010/080.
 * NOTE(review): this note used to say "icmp_", but no entry in the table
 * uses that prefix; the private entries below are the ones starting with
 * "_" - confirm against the PSARC case.
 *
 * The is_* accessor macros that follow the table index it by position, so
 * the order of the entries must not change without updating those macros.
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
	/* tunable - 0 */
	{ "_wroff_extra", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 128, 32}, {32} },

	{ "_ipv4_ttl", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 255, 255}, {255} },

	{ "_ipv6_hoplimit", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
	    {IPV6_DEFAULT_HOPS} },

	{ "_bsd_compat", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	{ "send_maxbuf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {4096, 65536, 8192}, {8192} },

	{ "_xmit_lowat", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 65536, 1024}, {1024} },

	{ "recv_maxbuf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {4096, 65536, 8192}, {8192} },

	{ "_max_buf", MOD_PROTO_RAWIP,
	    mod_set_uint32, mod_get_uint32,
	    {65536, 1024*1024*1024, 256*1024}, {256 * 1024} },

	{ "_pmtu_discovery", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_sendto_ignerr", MOD_PROTO_RAWIP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

	{ NULL, 0, NULL, NULL, {0}, {0} }
};

/* Positional accessors into the per-stack copy of the table above. */
#define	is_wroff_extra		is_propinfo_tbl[0].prop_cur_uval
#define	is_ipv4_ttl		is_propinfo_tbl[1].prop_cur_uval
#define	is_ipv6_hoplimit	is_propinfo_tbl[2].prop_cur_uval
#define	is_bsd_compat		is_propinfo_tbl[3].prop_cur_bval
#define is_xmit_hiwat is_propinfo_tbl[4].prop_cur_uval 275 #define is_xmit_lowat is_propinfo_tbl[5].prop_cur_uval 276 #define is_recv_hiwat is_propinfo_tbl[6].prop_cur_uval 277 #define is_max_buf is_propinfo_tbl[7].prop_cur_uval 278 #define is_pmtu_discovery is_propinfo_tbl[8].prop_cur_bval 279 #define is_sendto_ignerr is_propinfo_tbl[9].prop_cur_bval 280 281 typedef union T_primitives *t_primp_t; 282 283 /* 284 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 285 * passed to icmp_wput. 286 * It calls IP to verify the local IP address, and calls IP to insert 287 * the conn_t in the fanout table. 288 * If everything is ok it then sends the T_BIND_ACK back up. 289 */ 290 static void 291 icmp_tpi_bind(queue_t *q, mblk_t *mp) 292 { 293 int error; 294 struct sockaddr *sa; 295 struct T_bind_req *tbr; 296 socklen_t len; 297 sin_t *sin; 298 sin6_t *sin6; 299 icmp_t *icmp; 300 conn_t *connp = Q_TO_CONN(q); 301 mblk_t *mp1; 302 cred_t *cr; 303 304 /* 305 * All Solaris components should pass a db_credp 306 * for this TPI message, hence we ASSERT. 307 * But in case there is some other M_PROTO that looks 308 * like a TPI message sent by some other kernel 309 * component, we check and return an error. 310 */ 311 cr = msg_getcred(mp, NULL); 312 ASSERT(cr != NULL); 313 if (cr == NULL) { 314 icmp_err_ack(q, mp, TSYSERR, EINVAL); 315 return; 316 } 317 318 icmp = connp->conn_icmp; 319 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 320 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 321 "icmp_bind: bad req, len %u", 322 (uint_t)(mp->b_wptr - mp->b_rptr)); 323 icmp_err_ack(q, mp, TPROTO, 0); 324 return; 325 } 326 327 if (icmp->icmp_state != TS_UNBND) { 328 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 329 "icmp_bind: bad state, %u", icmp->icmp_state); 330 icmp_err_ack(q, mp, TOUTSTATE, 0); 331 return; 332 } 333 334 /* 335 * Reallocate the message to make sure we have enough room for an 336 * address. 
337 */ 338 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 339 if (mp1 == NULL) { 340 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 341 return; 342 } 343 mp = mp1; 344 345 /* Reset the message type in preparation for shipping it back. */ 346 DB_TYPE(mp) = M_PCPROTO; 347 tbr = (struct T_bind_req *)mp->b_rptr; 348 len = tbr->ADDR_length; 349 switch (len) { 350 case 0: /* request for a generic port */ 351 tbr->ADDR_offset = sizeof (struct T_bind_req); 352 if (connp->conn_family == AF_INET) { 353 tbr->ADDR_length = sizeof (sin_t); 354 sin = (sin_t *)&tbr[1]; 355 *sin = sin_null; 356 sin->sin_family = AF_INET; 357 mp->b_wptr = (uchar_t *)&sin[1]; 358 sa = (struct sockaddr *)sin; 359 len = sizeof (sin_t); 360 } else { 361 ASSERT(connp->conn_family == AF_INET6); 362 tbr->ADDR_length = sizeof (sin6_t); 363 sin6 = (sin6_t *)&tbr[1]; 364 *sin6 = sin6_null; 365 sin6->sin6_family = AF_INET6; 366 mp->b_wptr = (uchar_t *)&sin6[1]; 367 sa = (struct sockaddr *)sin6; 368 len = sizeof (sin6_t); 369 } 370 break; 371 372 case sizeof (sin_t): /* Complete IPv4 address */ 373 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 374 sizeof (sin_t)); 375 break; 376 377 case sizeof (sin6_t): /* Complete IPv6 address */ 378 sa = (struct sockaddr *)mi_offset_param(mp, 379 tbr->ADDR_offset, sizeof (sin6_t)); 380 break; 381 382 default: 383 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 384 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 385 icmp_err_ack(q, mp, TBADADDR, 0); 386 return; 387 } 388 389 error = rawip_do_bind(connp, sa, len); 390 if (error != 0) { 391 if (error > 0) { 392 icmp_err_ack(q, mp, TSYSERR, error); 393 } else { 394 icmp_err_ack(q, mp, -error, 0); 395 } 396 } else { 397 tbr->PRIM_type = T_BIND_ACK; 398 qreply(q, mp); 399 } 400 } 401 402 static int 403 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 404 { 405 sin_t *sin; 406 sin6_t *sin6; 407 icmp_t *icmp = connp->conn_icmp; 408 int error = 0; 409 ip_laddr_t laddr_type = 
IPVL_UNICAST_UP; /* INADDR_ANY */ 410 in_port_t lport; /* Network byte order */ 411 ipaddr_t v4src; /* Set if AF_INET */ 412 in6_addr_t v6src; 413 uint_t scopeid = 0; 414 zoneid_t zoneid = IPCL_ZONEID(connp); 415 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 416 417 if (sa == NULL || !OK_32PTR((char *)sa)) { 418 return (EINVAL); 419 } 420 421 switch (len) { 422 case sizeof (sin_t): /* Complete IPv4 address */ 423 sin = (sin_t *)sa; 424 if (sin->sin_family != AF_INET || 425 connp->conn_family != AF_INET) { 426 /* TSYSERR, EAFNOSUPPORT */ 427 return (EAFNOSUPPORT); 428 } 429 v4src = sin->sin_addr.s_addr; 430 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 431 if (v4src != INADDR_ANY) { 432 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 433 B_TRUE); 434 } 435 lport = sin->sin_port; 436 break; 437 case sizeof (sin6_t): /* Complete IPv6 address */ 438 sin6 = (sin6_t *)sa; 439 if (sin6->sin6_family != AF_INET6 || 440 connp->conn_family != AF_INET6) { 441 /* TSYSERR, EAFNOSUPPORT */ 442 return (EAFNOSUPPORT); 443 } 444 /* No support for mapped addresses on raw sockets */ 445 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 446 /* TSYSERR, EADDRNOTAVAIL */ 447 return (EADDRNOTAVAIL); 448 } 449 v6src = sin6->sin6_addr; 450 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 451 if (IN6_IS_ADDR_LINKSCOPE(&v6src)) 452 scopeid = sin6->sin6_scope_id; 453 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 454 B_TRUE, scopeid); 455 } 456 lport = sin6->sin6_port; 457 break; 458 459 default: 460 /* TBADADDR */ 461 return (EADDRNOTAVAIL); 462 } 463 464 /* Is the local address a valid unicast, multicast, or broadcast? */ 465 if (laddr_type == IPVL_BAD) 466 return (EADDRNOTAVAIL); 467 468 /* 469 * The state must be TS_UNBND. 470 */ 471 mutex_enter(&connp->conn_lock); 472 if (icmp->icmp_state != TS_UNBND) { 473 mutex_exit(&connp->conn_lock); 474 return (-TOUTSTATE); 475 } 476 477 /* 478 * Copy the source address into our icmp structure. 
This address 479 * may still be zero; if so, ip will fill in the correct address 480 * each time an outbound packet is passed to it. 481 * If we are binding to a broadcast or multicast address then 482 * we just set the conn_bound_addr since we don't want to use 483 * that as the source address when sending. 484 */ 485 connp->conn_bound_addr_v6 = v6src; 486 connp->conn_laddr_v6 = v6src; 487 if (scopeid != 0) { 488 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 489 connp->conn_ixa->ixa_scopeid = scopeid; 490 connp->conn_incoming_ifindex = scopeid; 491 } else { 492 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 493 connp->conn_incoming_ifindex = connp->conn_bound_if; 494 } 495 496 switch (laddr_type) { 497 case IPVL_UNICAST_UP: 498 case IPVL_UNICAST_DOWN: 499 connp->conn_saddr_v6 = v6src; 500 connp->conn_mcbc_bind = B_FALSE; 501 break; 502 case IPVL_MCAST: 503 case IPVL_BCAST: 504 /* ip_set_destination will pick a source address later */ 505 connp->conn_saddr_v6 = ipv6_all_zeros; 506 connp->conn_mcbc_bind = B_TRUE; 507 break; 508 } 509 510 /* Any errors after this point should use late_error */ 511 512 /* 513 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 514 * with IPPROTO_TCP. 515 */ 516 connp->conn_lport = lport; 517 connp->conn_fport = 0; 518 519 if (connp->conn_family == AF_INET) { 520 ASSERT(connp->conn_ipversion == IPV4_VERSION); 521 } else { 522 ASSERT(connp->conn_ipversion == IPV6_VERSION); 523 } 524 525 icmp->icmp_state = TS_IDLE; 526 527 /* 528 * We create an initial header template here to make a subsequent 529 * sendto have a starting point. Since conn_last_dst is zero the 530 * first sendto will always follow the 'dst changed' code path. 531 * Note that we defer massaging options and the related checksum 532 * adjustment until we have a destination address. 
533 */ 534 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 535 &connp->conn_faddr_v6, connp->conn_flowinfo); 536 if (error != 0) { 537 mutex_exit(&connp->conn_lock); 538 goto late_error; 539 } 540 /* Just in case */ 541 connp->conn_faddr_v6 = ipv6_all_zeros; 542 connp->conn_v6lastdst = ipv6_all_zeros; 543 mutex_exit(&connp->conn_lock); 544 545 error = ip_laddr_fanout_insert(connp); 546 if (error != 0) 547 goto late_error; 548 549 /* Bind succeeded */ 550 return (0); 551 552 late_error: 553 mutex_enter(&connp->conn_lock); 554 connp->conn_saddr_v6 = ipv6_all_zeros; 555 connp->conn_bound_addr_v6 = ipv6_all_zeros; 556 connp->conn_laddr_v6 = ipv6_all_zeros; 557 if (scopeid != 0) { 558 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 559 connp->conn_incoming_ifindex = connp->conn_bound_if; 560 } 561 icmp->icmp_state = TS_UNBND; 562 connp->conn_v6lastdst = ipv6_all_zeros; 563 connp->conn_lport = 0; 564 565 /* Restore the header that was built above - different source address */ 566 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 567 &connp->conn_faddr_v6, connp->conn_flowinfo); 568 mutex_exit(&connp->conn_lock); 569 return (error); 570 } 571 572 /* 573 * Tell IP to just bind to the protocol. 574 */ 575 static void 576 icmp_bind_proto(icmp_t *icmp) 577 { 578 conn_t *connp = icmp->icmp_connp; 579 580 mutex_enter(&connp->conn_lock); 581 connp->conn_saddr_v6 = ipv6_all_zeros; 582 connp->conn_laddr_v6 = ipv6_all_zeros; 583 connp->conn_faddr_v6 = ipv6_all_zeros; 584 connp->conn_v6lastdst = ipv6_all_zeros; 585 mutex_exit(&connp->conn_lock); 586 587 (void) ip_laddr_fanout_insert(connp); 588 } 589 590 /* 591 * This routine handles each T_CONN_REQ message passed to icmp. It 592 * associates a default destination address with the stream. 593 * 594 * After various error checks are completed, icmp_connect() lays 595 * the target address and port into the composite header template. 
596 * Then we ask IP for information, including a source address if we didn't 597 * already have one. Finally we send up the T_OK_ACK reply message. 598 */ 599 static void 600 icmp_tpi_connect(queue_t *q, mblk_t *mp) 601 { 602 conn_t *connp = Q_TO_CONN(q); 603 struct T_conn_req *tcr; 604 struct sockaddr *sa; 605 socklen_t len; 606 int error; 607 cred_t *cr; 608 pid_t pid; 609 /* 610 * All Solaris components should pass a db_credp 611 * for this TPI message, hence we ASSERT. 612 * But in case there is some other M_PROTO that looks 613 * like a TPI message sent by some other kernel 614 * component, we check and return an error. 615 */ 616 cr = msg_getcred(mp, &pid); 617 ASSERT(cr != NULL); 618 if (cr == NULL) { 619 icmp_err_ack(q, mp, TSYSERR, EINVAL); 620 return; 621 } 622 623 tcr = (struct T_conn_req *)mp->b_rptr; 624 /* Sanity checks */ 625 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 626 icmp_err_ack(q, mp, TPROTO, 0); 627 return; 628 } 629 630 if (tcr->OPT_length != 0) { 631 icmp_err_ack(q, mp, TBADOPT, 0); 632 return; 633 } 634 635 len = tcr->DEST_length; 636 637 switch (len) { 638 default: 639 icmp_err_ack(q, mp, TBADADDR, 0); 640 return; 641 case sizeof (sin_t): 642 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 643 sizeof (sin_t)); 644 break; 645 case sizeof (sin6_t): 646 sa = (struct sockaddr *)mi_offset_param(mp, 647 tcr->DEST_offset, sizeof (sin6_t)); 648 break; 649 } 650 651 error = proto_verify_ip_addr(connp->conn_family, sa, len); 652 if (error != 0) { 653 icmp_err_ack(q, mp, TSYSERR, error); 654 return; 655 } 656 657 error = rawip_do_connect(connp, sa, len, cr, pid); 658 if (error != 0) { 659 if (error < 0) { 660 icmp_err_ack(q, mp, -error, 0); 661 } else { 662 icmp_err_ack(q, mp, 0, error); 663 } 664 } else { 665 mblk_t *mp1; 666 667 /* 668 * We have to send a connection confirmation to 669 * keep TLI happy. 
670 */ 671 if (connp->conn_family == AF_INET) { 672 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 673 sizeof (sin_t), NULL, 0); 674 } else { 675 ASSERT(connp->conn_family == AF_INET6); 676 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 677 sizeof (sin6_t), NULL, 0); 678 } 679 if (mp1 == NULL) { 680 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 681 return; 682 } 683 684 /* 685 * Send ok_ack for T_CONN_REQ 686 */ 687 mp = mi_tpi_ok_ack_alloc(mp); 688 if (mp == NULL) { 689 /* Unable to reuse the T_CONN_REQ for the ack. */ 690 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 691 return; 692 } 693 putnext(connp->conn_rq, mp); 694 putnext(connp->conn_rq, mp1); 695 } 696 } 697 698 static int 699 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 700 cred_t *cr, pid_t pid) 701 { 702 icmp_t *icmp; 703 sin_t *sin; 704 sin6_t *sin6; 705 int error; 706 uint16_t dstport; 707 ipaddr_t v4dst; 708 in6_addr_t v6dst; 709 uint32_t flowinfo; 710 ip_xmit_attr_t *ixa; 711 ip_xmit_attr_t *oldixa; 712 uint_t scopeid = 0; 713 uint_t srcid = 0; 714 in6_addr_t v6src = connp->conn_saddr_v6; 715 716 icmp = connp->conn_icmp; 717 718 if (sa == NULL || !OK_32PTR((char *)sa)) { 719 return (EINVAL); 720 } 721 722 ASSERT(sa != NULL && len != 0); 723 724 /* 725 * Determine packet type based on type of address passed in 726 * the request should contain an IPv4 or IPv6 address. 727 * Make sure that address family matches the type of 728 * family of the address passed down. 
729 */ 730 switch (len) { 731 case sizeof (sin_t): 732 sin = (sin_t *)sa; 733 734 v4dst = sin->sin_addr.s_addr; 735 dstport = sin->sin_port; 736 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 737 ASSERT(connp->conn_ipversion == IPV4_VERSION); 738 break; 739 740 case sizeof (sin6_t): 741 sin6 = (sin6_t *)sa; 742 743 /* No support for mapped addresses on raw sockets */ 744 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 745 return (EADDRNOTAVAIL); 746 } 747 v6dst = sin6->sin6_addr; 748 dstport = sin6->sin6_port; 749 ASSERT(connp->conn_ipversion == IPV6_VERSION); 750 flowinfo = sin6->sin6_flowinfo; 751 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 752 scopeid = sin6->sin6_scope_id; 753 srcid = sin6->__sin6_src_id; 754 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 755 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 756 connp->conn_netstack); 757 } 758 break; 759 } 760 761 /* 762 * If there is a different thread using conn_ixa then we get a new 763 * copy and cut the old one loose from conn_ixa. Otherwise we use 764 * conn_ixa and prevent any other thread from using/changing it. 765 * Once connect() is done other threads can use conn_ixa since the 766 * refcnt will be back at one. 767 * We defer updating conn_ixa until later to handle any concurrent 768 * conn_ixa_cleanup thread. 769 */ 770 ixa = conn_get_ixa(connp, B_FALSE); 771 if (ixa == NULL) 772 return (ENOMEM); 773 774 mutex_enter(&connp->conn_lock); 775 /* 776 * This icmp_t must have bound already before doing a connect. 777 * Reject if a connect is in progress (we drop conn_lock during 778 * rawip_do_connect). 
779 */ 780 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 781 mutex_exit(&connp->conn_lock); 782 ixa_refrele(ixa); 783 return (-TOUTSTATE); 784 } 785 786 if (icmp->icmp_state == TS_DATA_XFER) { 787 /* Already connected - clear out state */ 788 if (connp->conn_mcbc_bind) 789 connp->conn_saddr_v6 = ipv6_all_zeros; 790 else 791 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 792 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 793 connp->conn_faddr_v6 = ipv6_all_zeros; 794 icmp->icmp_state = TS_IDLE; 795 } 796 797 /* 798 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 799 * with IPPROTO_TCP. 800 */ 801 connp->conn_fport = dstport; 802 if (connp->conn_ipversion == IPV4_VERSION) { 803 /* 804 * Interpret a zero destination to mean loopback. 805 * Update the T_CONN_REQ (sin/sin6) since it is used to 806 * generate the T_CONN_CON. 807 */ 808 if (v4dst == INADDR_ANY) { 809 v4dst = htonl(INADDR_LOOPBACK); 810 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 811 ASSERT(connp->conn_family == AF_INET); 812 sin->sin_addr.s_addr = v4dst; 813 } 814 connp->conn_faddr_v6 = v6dst; 815 connp->conn_flowinfo = 0; 816 } else { 817 ASSERT(connp->conn_ipversion == IPV6_VERSION); 818 /* 819 * Interpret a zero destination to mean loopback. 820 * Update the T_CONN_REQ (sin/sin6) since it is used to 821 * generate the T_CONN_CON. 
822 */ 823 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 824 v6dst = ipv6_loopback; 825 sin6->sin6_addr = v6dst; 826 } 827 connp->conn_faddr_v6 = v6dst; 828 connp->conn_flowinfo = flowinfo; 829 } 830 831 /* 832 * We update our cred/cpid based on the caller of connect 833 */ 834 if (connp->conn_cred != cr) { 835 crhold(cr); 836 crfree(connp->conn_cred); 837 connp->conn_cred = cr; 838 } 839 connp->conn_cpid = pid; 840 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 841 ixa->ixa_cred = cr; 842 ixa->ixa_cpid = pid; 843 if (is_system_labeled()) { 844 /* We need to restart with a label based on the cred */ 845 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 846 } 847 848 if (scopeid != 0) { 849 ixa->ixa_flags |= IXAF_SCOPEID_SET; 850 ixa->ixa_scopeid = scopeid; 851 connp->conn_incoming_ifindex = scopeid; 852 } else { 853 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 854 connp->conn_incoming_ifindex = connp->conn_bound_if; 855 } 856 857 /* 858 * conn_connect will drop conn_lock and reacquire it. 859 * To prevent a send* from messing with this icmp_t while the lock 860 * is dropped we set icmp_state and clear conn_v6lastdst. 861 * That will make all send* fail with EISCONN. 862 */ 863 connp->conn_v6lastdst = ipv6_all_zeros; 864 icmp->icmp_state = TS_WCON_CREQ; 865 866 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 867 mutex_exit(&connp->conn_lock); 868 if (error != 0) 869 goto connect_failed; 870 871 /* 872 * The addresses have been verified. Time to insert in 873 * the correct fanout list. 
874 */ 875 error = ipcl_conn_insert(connp); 876 if (error != 0) 877 goto connect_failed; 878 879 mutex_enter(&connp->conn_lock); 880 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 881 &connp->conn_faddr_v6, connp->conn_flowinfo); 882 if (error != 0) { 883 mutex_exit(&connp->conn_lock); 884 goto connect_failed; 885 } 886 887 icmp->icmp_state = TS_DATA_XFER; 888 /* Record this as the "last" send even though we haven't sent any */ 889 connp->conn_v6lastdst = connp->conn_faddr_v6; 890 connp->conn_lastipversion = connp->conn_ipversion; 891 connp->conn_lastdstport = connp->conn_fport; 892 connp->conn_lastflowinfo = connp->conn_flowinfo; 893 connp->conn_lastscopeid = scopeid; 894 connp->conn_lastsrcid = srcid; 895 /* Also remember a source to use together with lastdst */ 896 connp->conn_v6lastsrc = v6src; 897 898 oldixa = conn_replace_ixa(connp, ixa); 899 mutex_exit(&connp->conn_lock); 900 ixa_refrele(oldixa); 901 902 ixa_refrele(ixa); 903 return (0); 904 905 connect_failed: 906 if (ixa != NULL) 907 ixa_refrele(ixa); 908 mutex_enter(&connp->conn_lock); 909 icmp->icmp_state = TS_IDLE; 910 /* In case the source address was set above */ 911 if (connp->conn_mcbc_bind) 912 connp->conn_saddr_v6 = ipv6_all_zeros; 913 else 914 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 915 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 916 connp->conn_faddr_v6 = ipv6_all_zeros; 917 connp->conn_v6lastdst = ipv6_all_zeros; 918 connp->conn_flowinfo = 0; 919 920 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 921 &connp->conn_faddr_v6, connp->conn_flowinfo); 922 mutex_exit(&connp->conn_lock); 923 return (error); 924 } 925 926 static void 927 rawip_do_close(conn_t *connp) 928 { 929 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 930 931 ip_quiesce_conn(connp); 932 933 if (!IPCL_IS_NONSTR(connp)) { 934 qprocsoff(connp->conn_rq); 935 } 936 937 icmp_close_free(connp); 938 939 /* 940 * Now we are truly single threaded on this stream, and can 941 * delete the things 
hanging off the connp, and finally the connp. 942 * We removed this connp from the fanout list, it cannot be 943 * accessed thru the fanouts, and we already waited for the 944 * conn_ref to drop to 0. We are already in close, so 945 * there cannot be any other thread from the top. qprocsoff 946 * has completed, and service has completed or won't run in 947 * future. 948 */ 949 ASSERT(connp->conn_ref == 1); 950 951 if (!IPCL_IS_NONSTR(connp)) { 952 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 953 } else { 954 ip_free_helper_stream(connp); 955 } 956 957 connp->conn_ref--; 958 ipcl_conn_destroy(connp); 959 } 960 961 static int 962 icmp_close(queue_t *q, int flags) 963 { 964 conn_t *connp; 965 966 if (flags & SO_FALLBACK) { 967 /* 968 * stream is being closed while in fallback 969 * simply free the resources that were allocated 970 */ 971 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 972 qprocsoff(q); 973 goto done; 974 } 975 976 connp = Q_TO_CONN(q); 977 (void) rawip_do_close(connp); 978 done: 979 q->q_ptr = WR(q)->q_ptr = NULL; 980 return (0); 981 } 982 983 static void 984 icmp_close_free(conn_t *connp) 985 { 986 icmp_t *icmp = connp->conn_icmp; 987 988 if (icmp->icmp_filter != NULL) { 989 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 990 icmp->icmp_filter = NULL; 991 } 992 993 /* 994 * Clear any fields which the kmem_cache constructor clears. 995 * Only icmp_connp needs to be preserved. 996 * TBD: We should make this more efficient to avoid clearing 997 * everything. 998 */ 999 ASSERT(icmp->icmp_connp == connp); 1000 bzero(icmp, sizeof (icmp_t)); 1001 icmp->icmp_connp = connp; 1002 } 1003 1004 /* 1005 * This routine handles each T_DISCON_REQ message passed to icmp 1006 * as an indicating that ICMP is no longer connected. This results 1007 * in telling IP to restore the binding to just the local address. 
1008 */ 1009 static int 1010 icmp_do_disconnect(conn_t *connp) 1011 { 1012 icmp_t *icmp = connp->conn_icmp; 1013 int error; 1014 1015 mutex_enter(&connp->conn_lock); 1016 if (icmp->icmp_state != TS_DATA_XFER) { 1017 mutex_exit(&connp->conn_lock); 1018 return (-TOUTSTATE); 1019 } 1020 if (connp->conn_mcbc_bind) 1021 connp->conn_saddr_v6 = ipv6_all_zeros; 1022 else 1023 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1024 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1025 connp->conn_faddr_v6 = ipv6_all_zeros; 1026 icmp->icmp_state = TS_IDLE; 1027 1028 connp->conn_v6lastdst = ipv6_all_zeros; 1029 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1030 &connp->conn_faddr_v6, connp->conn_flowinfo); 1031 mutex_exit(&connp->conn_lock); 1032 if (error != 0) 1033 return (error); 1034 1035 /* 1036 * Tell IP to remove the full binding and revert 1037 * to the local address binding. 1038 */ 1039 return (ip_laddr_fanout_insert(connp)); 1040 } 1041 1042 static void 1043 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1044 { 1045 conn_t *connp = Q_TO_CONN(q); 1046 int error; 1047 1048 /* 1049 * Allocate the largest primitive we need to send back 1050 * T_error_ack is > than T_ok_ack 1051 */ 1052 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1053 if (mp == NULL) { 1054 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1055 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1056 return; 1057 } 1058 1059 error = icmp_do_disconnect(connp); 1060 1061 if (error != 0) { 1062 if (error > 0) { 1063 icmp_err_ack(q, mp, 0, error); 1064 } else { 1065 icmp_err_ack(q, mp, -error, 0); 1066 } 1067 } else { 1068 mp = mi_tpi_ok_ack_alloc(mp); 1069 ASSERT(mp != NULL); 1070 qreply(q, mp); 1071 } 1072 } 1073 1074 static int 1075 icmp_disconnect(conn_t *connp) 1076 { 1077 int error; 1078 1079 connp->conn_dgram_errind = B_FALSE; 1080 1081 error = icmp_do_disconnect(connp); 1082 1083 if (error < 0) 1084 error = proto_tlitosyserr(-error); 1085 return (error); 1086 } 1087 1088 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1089 static void 1090 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1091 { 1092 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1093 qreply(q, mp); 1094 } 1095 1096 /* Shorthand to generate and send TPI error acks to our client */ 1097 static void 1098 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1099 t_scalar_t t_error, int sys_error) 1100 { 1101 struct T_error_ack *teackp; 1102 1103 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1104 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1105 teackp = (struct T_error_ack *)mp->b_rptr; 1106 teackp->ERROR_prim = primitive; 1107 teackp->TLI_error = t_error; 1108 teackp->UNIX_error = sys_error; 1109 qreply(q, mp); 1110 } 1111 } 1112 1113 /* 1114 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1115 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1116 * Assumes that IP has pulled up everything up to and including the ICMP header. 
1117 */ 1118 /* ARGSUSED2 */ 1119 static void 1120 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1121 { 1122 conn_t *connp = (conn_t *)arg1; 1123 icmp_t *icmp = connp->conn_icmp; 1124 icmph_t *icmph; 1125 ipha_t *ipha; 1126 int iph_hdr_length; 1127 sin_t sin; 1128 mblk_t *mp1; 1129 int error = 0; 1130 1131 ipha = (ipha_t *)mp->b_rptr; 1132 1133 ASSERT(OK_32PTR(mp->b_rptr)); 1134 1135 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1136 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1137 icmp_icmp_error_ipv6(connp, mp, ira); 1138 return; 1139 } 1140 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1141 1142 /* Skip past the outer IP and ICMP headers */ 1143 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1144 iph_hdr_length = ira->ira_ip_hdr_length; 1145 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1146 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1147 1148 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1149 1150 switch (icmph->icmph_type) { 1151 case ICMP_DEST_UNREACHABLE: 1152 switch (icmph->icmph_code) { 1153 case ICMP_FRAGMENTATION_NEEDED: { 1154 ipha_t *ipha; 1155 ip_xmit_attr_t *ixa; 1156 /* 1157 * IP has already adjusted the path MTU. 1158 * But we need to adjust DF for IPv4. 1159 */ 1160 if (connp->conn_ipversion != IPV4_VERSION) 1161 break; 1162 1163 ixa = conn_get_ixa(connp, B_FALSE); 1164 if (ixa == NULL || ixa->ixa_ire == NULL) { 1165 /* 1166 * Some other thread holds conn_ixa. We will 1167 * redo this on the next ICMP too big. 
1168 */ 1169 if (ixa != NULL) 1170 ixa_refrele(ixa); 1171 break; 1172 } 1173 (void) ip_get_pmtu(ixa); 1174 1175 mutex_enter(&connp->conn_lock); 1176 ipha = (ipha_t *)connp->conn_ht_iphc; 1177 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1178 ipha->ipha_fragment_offset_and_flags |= 1179 IPH_DF_HTONS; 1180 } else { 1181 ipha->ipha_fragment_offset_and_flags &= 1182 ~IPH_DF_HTONS; 1183 } 1184 mutex_exit(&connp->conn_lock); 1185 ixa_refrele(ixa); 1186 break; 1187 } 1188 case ICMP_PORT_UNREACHABLE: 1189 case ICMP_PROTOCOL_UNREACHABLE: 1190 error = ECONNREFUSED; 1191 break; 1192 default: 1193 /* Transient errors */ 1194 break; 1195 } 1196 break; 1197 default: 1198 /* Transient errors */ 1199 break; 1200 } 1201 if (error == 0) { 1202 freemsg(mp); 1203 return; 1204 } 1205 1206 /* 1207 * Deliver T_UDERROR_IND when the application has asked for it. 1208 * The socket layer enables this automatically when connected. 1209 */ 1210 if (!connp->conn_dgram_errind) { 1211 freemsg(mp); 1212 return; 1213 } 1214 1215 sin = sin_null; 1216 sin.sin_family = AF_INET; 1217 sin.sin_addr.s_addr = ipha->ipha_dst; 1218 1219 if (IPCL_IS_NONSTR(connp)) { 1220 mutex_enter(&connp->conn_lock); 1221 if (icmp->icmp_state == TS_DATA_XFER) { 1222 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1223 mutex_exit(&connp->conn_lock); 1224 (*connp->conn_upcalls->su_set_error) 1225 (connp->conn_upper_handle, error); 1226 goto done; 1227 } 1228 } else { 1229 icmp->icmp_delayed_error = error; 1230 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1231 } 1232 mutex_exit(&connp->conn_lock); 1233 } else { 1234 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1235 error); 1236 if (mp1 != NULL) 1237 putnext(connp->conn_rq, mp1); 1238 } 1239 done: 1240 freemsg(mp); 1241 } 1242 1243 /* 1244 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1245 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 
1246 * Assumes that IP has pulled up all the extension headers as well as the 1247 * ICMPv6 header. 1248 */ 1249 static void 1250 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) 1251 { 1252 icmp6_t *icmp6; 1253 ip6_t *ip6h, *outer_ip6h; 1254 uint16_t iph_hdr_length; 1255 uint8_t *nexthdrp; 1256 sin6_t sin6; 1257 mblk_t *mp1; 1258 int error = 0; 1259 icmp_t *icmp = connp->conn_icmp; 1260 1261 outer_ip6h = (ip6_t *)mp->b_rptr; 1262 #ifdef DEBUG 1263 if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) 1264 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); 1265 else 1266 iph_hdr_length = IPV6_HDR_LEN; 1267 ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); 1268 #endif 1269 /* Skip past the outer IP and ICMP headers */ 1270 iph_hdr_length = ira->ira_ip_hdr_length; 1271 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 1272 1273 ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ 1274 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { 1275 freemsg(mp); 1276 return; 1277 } 1278 1279 switch (icmp6->icmp6_type) { 1280 case ICMP6_DST_UNREACH: 1281 switch (icmp6->icmp6_code) { 1282 case ICMP6_DST_UNREACH_NOPORT: 1283 error = ECONNREFUSED; 1284 break; 1285 case ICMP6_DST_UNREACH_ADMIN: 1286 case ICMP6_DST_UNREACH_NOROUTE: 1287 case ICMP6_DST_UNREACH_BEYONDSCOPE: 1288 case ICMP6_DST_UNREACH_ADDR: 1289 /* Transient errors */ 1290 break; 1291 default: 1292 break; 1293 } 1294 break; 1295 case ICMP6_PACKET_TOO_BIG: { 1296 struct T_unitdata_ind *tudi; 1297 struct T_opthdr *toh; 1298 size_t udi_size; 1299 mblk_t *newmp; 1300 t_scalar_t opt_length = sizeof (struct T_opthdr) + 1301 sizeof (struct ip6_mtuinfo); 1302 sin6_t *sin6; 1303 struct ip6_mtuinfo *mtuinfo; 1304 1305 /* 1306 * If the application has requested to receive path mtu 1307 * information, send up an empty message containing an 1308 * IPV6_PATHMTU ancillary data item. 
1309 */ 1310 if (!connp->conn_ipv6_recvpathmtu) 1311 break; 1312 1313 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + 1314 opt_length; 1315 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) { 1316 BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors); 1317 break; 1318 } 1319 1320 /* 1321 * newmp->b_cont is left to NULL on purpose. This is an 1322 * empty message containing only ancillary data. 1323 */ 1324 newmp->b_datap->db_type = M_PROTO; 1325 tudi = (struct T_unitdata_ind *)newmp->b_rptr; 1326 newmp->b_wptr = (uchar_t *)tudi + udi_size; 1327 tudi->PRIM_type = T_UNITDATA_IND; 1328 tudi->SRC_length = sizeof (sin6_t); 1329 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 1330 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t); 1331 tudi->OPT_length = opt_length; 1332 1333 sin6 = (sin6_t *)&tudi[1]; 1334 bzero(sin6, sizeof (sin6_t)); 1335 sin6->sin6_family = AF_INET6; 1336 sin6->sin6_addr = connp->conn_faddr_v6; 1337 1338 toh = (struct T_opthdr *)&sin6[1]; 1339 toh->level = IPPROTO_IPV6; 1340 toh->name = IPV6_PATHMTU; 1341 toh->len = opt_length; 1342 toh->status = 0; 1343 1344 mtuinfo = (struct ip6_mtuinfo *)&toh[1]; 1345 bzero(mtuinfo, sizeof (struct ip6_mtuinfo)); 1346 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 1347 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst; 1348 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu; 1349 /* 1350 * We've consumed everything we need from the original 1351 * message. Free it, then send our empty message. 
1352 */ 1353 freemsg(mp); 1354 icmp_ulp_recv(connp, newmp, msgdsize(newmp)); 1355 return; 1356 } 1357 case ICMP6_TIME_EXCEEDED: 1358 /* Transient errors */ 1359 break; 1360 case ICMP6_PARAM_PROB: 1361 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 1362 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 1363 (uchar_t *)ip6h + icmp6->icmp6_pptr == 1364 (uchar_t *)nexthdrp) { 1365 error = ECONNREFUSED; 1366 break; 1367 } 1368 break; 1369 } 1370 if (error == 0) { 1371 freemsg(mp); 1372 return; 1373 } 1374 1375 /* 1376 * Deliver T_UDERROR_IND when the application has asked for it. 1377 * The socket layer enables this automatically when connected. 1378 */ 1379 if (!connp->conn_dgram_errind) { 1380 freemsg(mp); 1381 return; 1382 } 1383 1384 sin6 = sin6_null; 1385 sin6.sin6_family = AF_INET6; 1386 sin6.sin6_addr = ip6h->ip6_dst; 1387 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 1388 if (IPCL_IS_NONSTR(connp)) { 1389 mutex_enter(&connp->conn_lock); 1390 if (icmp->icmp_state == TS_DATA_XFER) { 1391 if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, 1392 &connp->conn_faddr_v6)) { 1393 mutex_exit(&connp->conn_lock); 1394 (*connp->conn_upcalls->su_set_error) 1395 (connp->conn_upper_handle, error); 1396 goto done; 1397 } 1398 } else { 1399 icmp->icmp_delayed_error = error; 1400 *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; 1401 } 1402 mutex_exit(&connp->conn_lock); 1403 } else { 1404 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), 1405 NULL, 0, error); 1406 if (mp1 != NULL) 1407 putnext(connp->conn_rq, mp1); 1408 } 1409 done: 1410 freemsg(mp); 1411 } 1412 1413 /* 1414 * This routine responds to T_ADDR_REQ messages. It is called by icmp_wput. 1415 * The local address is filled in if endpoint is bound. The remote address 1416 * is filled in if remote address has been precified ("connected endpoint") 1417 * (The concept of connected CLTS sockets is alien to published TPI 1418 * but we support it anyway). 
1419 */ 1420 static void 1421 icmp_addr_req(queue_t *q, mblk_t *mp) 1422 { 1423 struct sockaddr *sa; 1424 mblk_t *ackmp; 1425 struct T_addr_ack *taa; 1426 icmp_t *icmp = Q_TO_ICMP(q); 1427 conn_t *connp = icmp->icmp_connp; 1428 uint_t addrlen; 1429 1430 /* Make it large enough for worst case */ 1431 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1432 2 * sizeof (sin6_t), 1); 1433 if (ackmp == NULL) { 1434 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1435 return; 1436 } 1437 taa = (struct T_addr_ack *)ackmp->b_rptr; 1438 1439 bzero(taa, sizeof (struct T_addr_ack)); 1440 ackmp->b_wptr = (uchar_t *)&taa[1]; 1441 1442 taa->PRIM_type = T_ADDR_ACK; 1443 ackmp->b_datap->db_type = M_PCPROTO; 1444 1445 if (connp->conn_family == AF_INET) 1446 addrlen = sizeof (sin_t); 1447 else 1448 addrlen = sizeof (sin6_t); 1449 1450 mutex_enter(&connp->conn_lock); 1451 /* 1452 * Note: Following code assumes 32 bit alignment of basic 1453 * data structures like sin_t and struct T_addr_ack. 1454 */ 1455 if (icmp->icmp_state != TS_UNBND) { 1456 /* 1457 * Fill in local address first 1458 */ 1459 taa->LOCADDR_offset = sizeof (*taa); 1460 taa->LOCADDR_length = addrlen; 1461 sa = (struct sockaddr *)&taa[1]; 1462 (void) conn_getsockname(connp, sa, &addrlen); 1463 ackmp->b_wptr += addrlen; 1464 } 1465 if (icmp->icmp_state == TS_DATA_XFER) { 1466 /* 1467 * connected, fill remote address too 1468 */ 1469 taa->REMADDR_length = addrlen; 1470 /* assumed 32-bit alignment */ 1471 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1472 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1473 (void) conn_getpeername(connp, sa, &addrlen); 1474 ackmp->b_wptr += addrlen; 1475 } 1476 mutex_exit(&connp->conn_lock); 1477 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1478 qreply(q, ackmp); 1479 } 1480 1481 static void 1482 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1483 { 1484 conn_t *connp = icmp->icmp_connp; 1485 1486 *tap = icmp_g_t_info_ack; 1487 1488 if (connp->conn_family 
== AF_INET6) 1489 tap->ADDR_size = sizeof (sin6_t); 1490 else 1491 tap->ADDR_size = sizeof (sin_t); 1492 tap->CURRENT_state = icmp->icmp_state; 1493 tap->OPT_size = icmp_max_optsize; 1494 } 1495 1496 static void 1497 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1498 t_uscalar_t cap_bits1) 1499 { 1500 tcap->CAP_bits1 = 0; 1501 1502 if (cap_bits1 & TC1_INFO) { 1503 icmp_copy_info(&tcap->INFO_ack, icmp); 1504 tcap->CAP_bits1 |= TC1_INFO; 1505 } 1506 } 1507 1508 /* 1509 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1510 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1511 * icmp_g_t_info_ack. The current state of the stream is copied from 1512 * icmp_state. 1513 */ 1514 static void 1515 icmp_capability_req(queue_t *q, mblk_t *mp) 1516 { 1517 icmp_t *icmp = Q_TO_ICMP(q); 1518 t_uscalar_t cap_bits1; 1519 struct T_capability_ack *tcap; 1520 1521 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1522 1523 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1524 mp->b_datap->db_type, T_CAPABILITY_ACK); 1525 if (!mp) 1526 return; 1527 1528 tcap = (struct T_capability_ack *)mp->b_rptr; 1529 1530 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1531 1532 qreply(q, mp); 1533 } 1534 1535 /* 1536 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1537 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1538 * The current state of the stream is copied from icmp_state. 1539 */ 1540 static void 1541 icmp_info_req(queue_t *q, mblk_t *mp) 1542 { 1543 icmp_t *icmp = Q_TO_ICMP(q); 1544 1545 /* Create a T_INFO_ACK message. 
*/ 1546 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1547 T_INFO_ACK); 1548 if (!mp) 1549 return; 1550 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1551 qreply(q, mp); 1552 } 1553 1554 static int 1555 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1556 int family) 1557 { 1558 conn_t *connp; 1559 dev_t conn_dev; 1560 int error; 1561 1562 /* If the stream is already open, return immediately. */ 1563 if (q->q_ptr != NULL) 1564 return (0); 1565 1566 if (sflag == MODOPEN) 1567 return (EINVAL); 1568 1569 /* 1570 * Since ICMP is not used so heavily, allocating from the small 1571 * arena should be sufficient. 1572 */ 1573 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1574 return (EBUSY); 1575 } 1576 1577 if (flag & SO_FALLBACK) { 1578 /* 1579 * Non streams socket needs a stream to fallback to 1580 */ 1581 RD(q)->q_ptr = (void *)conn_dev; 1582 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1583 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1584 qprocson(q); 1585 return (0); 1586 } 1587 1588 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1589 if (connp == NULL) { 1590 ASSERT(error != 0); 1591 inet_minor_free(ip_minor_arena_sa, conn_dev); 1592 return (error); 1593 } 1594 1595 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1596 connp->conn_dev = conn_dev; 1597 connp->conn_minor_arena = ip_minor_arena_sa; 1598 1599 /* 1600 * Initialize the icmp_t structure for this stream. 1601 */ 1602 q->q_ptr = connp; 1603 WR(q)->q_ptr = connp; 1604 connp->conn_rq = q; 1605 connp->conn_wq = WR(q); 1606 1607 WR(q)->q_hiwat = connp->conn_sndbuf; 1608 WR(q)->q_lowat = connp->conn_sndlowat; 1609 1610 qprocson(q); 1611 1612 /* Set the Stream head write offset. 
*/ 1613 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1614 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1615 1616 mutex_enter(&connp->conn_lock); 1617 connp->conn_state_flags &= ~CONN_INCIPIENT; 1618 mutex_exit(&connp->conn_lock); 1619 1620 icmp_bind_proto(connp->conn_icmp); 1621 1622 return (0); 1623 } 1624 1625 /* For /dev/icmp aka AF_INET open */ 1626 static int 1627 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1628 { 1629 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1630 } 1631 1632 /* For /dev/icmp6 aka AF_INET6 open */ 1633 static int 1634 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1635 { 1636 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1637 } 1638 1639 /* 1640 * This is the open routine for icmp. It allocates a icmp_t structure for 1641 * the stream and, on the first open of the module, creates an ND table. 1642 */ 1643 static conn_t * 1644 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1645 { 1646 icmp_t *icmp; 1647 conn_t *connp; 1648 zoneid_t zoneid; 1649 netstack_t *ns; 1650 icmp_stack_t *is; 1651 int len; 1652 boolean_t isv6 = B_FALSE; 1653 1654 *err = secpolicy_net_icmpaccess(credp); 1655 if (*err != 0) 1656 return (NULL); 1657 1658 if (family == AF_INET6) 1659 isv6 = B_TRUE; 1660 1661 ns = netstack_find_by_cred(credp); 1662 ASSERT(ns != NULL); 1663 is = ns->netstack_icmp; 1664 ASSERT(is != NULL); 1665 1666 /* 1667 * For exclusive stacks we set the zoneid to zero 1668 * to make ICMP operate as if in the global zone. 1669 */ 1670 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1671 zoneid = GLOBAL_ZONEID; 1672 else 1673 zoneid = crgetzoneid(credp); 1674 1675 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1676 1677 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1678 icmp = connp->conn_icmp; 1679 1680 /* 1681 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1682 * done by netstack_find_by_cred() 1683 */ 1684 netstack_rele(ns); 1685 1686 /* 1687 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1688 * need to lock anything. 1689 */ 1690 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1691 ASSERT(connp->conn_icmp == icmp); 1692 ASSERT(icmp->icmp_connp == connp); 1693 1694 /* Set the initial state of the stream and the privilege status. */ 1695 icmp->icmp_state = TS_UNBND; 1696 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1697 if (isv6) { 1698 connp->conn_family = AF_INET6; 1699 connp->conn_ipversion = IPV6_VERSION; 1700 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1701 connp->conn_proto = IPPROTO_ICMPV6; 1702 /* May be changed by a SO_PROTOTYPE socket option. */ 1703 connp->conn_proto = IPPROTO_ICMPV6; 1704 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1705 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1706 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1707 len = sizeof (ip6_t); 1708 } else { 1709 connp->conn_family = AF_INET; 1710 connp->conn_ipversion = IPV4_VERSION; 1711 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1712 /* May be changed by a SO_PROTOTYPE socket option. */ 1713 connp->conn_proto = IPPROTO_ICMP; 1714 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1715 connp->conn_default_ttl = is->is_ipv4_ttl; 1716 len = sizeof (ipha_t); 1717 } 1718 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1719 1720 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1721 1722 /* 1723 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1724 * the checksum is provided in the pre-built packet. We clear 1725 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1726 * complete IP header and not to compute the transport checksum. 
1727 */ 1728 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1729 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1730 connp->conn_ixa->ixa_zoneid = zoneid; 1731 1732 connp->conn_zoneid = zoneid; 1733 1734 /* 1735 * If the caller has the process-wide flag set, then default to MAC 1736 * exempt mode. This allows read-down to unlabeled hosts. 1737 */ 1738 if (getpflags(NET_MAC_AWARE, credp) != 0) 1739 connp->conn_mac_mode = CONN_MAC_AWARE; 1740 1741 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1742 1743 icmp->icmp_is = is; 1744 1745 connp->conn_rcvbuf = is->is_recv_hiwat; 1746 connp->conn_sndbuf = is->is_xmit_hiwat; 1747 connp->conn_sndlowat = is->is_xmit_lowat; 1748 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1749 1750 connp->conn_wroff = len + is->is_wroff_extra; 1751 connp->conn_so_type = SOCK_RAW; 1752 1753 connp->conn_recv = icmp_input; 1754 connp->conn_recvicmp = icmp_icmp_input; 1755 crhold(credp); 1756 connp->conn_cred = credp; 1757 connp->conn_cpid = curproc->p_pid; 1758 connp->conn_open_time = ddi_get_lbolt64(); 1759 /* Cache things in ixa without an extra refhold */ 1760 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1761 connp->conn_ixa->ixa_cred = connp->conn_cred; 1762 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1763 if (is_system_labeled()) 1764 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1765 1766 connp->conn_flow_cntrld = B_FALSE; 1767 1768 if (is->is_pmtu_discovery) 1769 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1770 1771 return (connp); 1772 } 1773 1774 /* 1775 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1776 */ 1777 /* ARGSUSED */ 1778 static boolean_t 1779 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1780 { 1781 return (B_TRUE); 1782 } 1783 1784 /* 1785 * This routine gets default values of certain options whose default 1786 * values are maintained by protcol specific code 1787 */ 1788 int 1789 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1790 { 1791 icmp_t *icmp = Q_TO_ICMP(q); 1792 icmp_stack_t *is = icmp->icmp_is; 1793 int *i1 = (int *)ptr; 1794 1795 switch (level) { 1796 case IPPROTO_IP: 1797 switch (name) { 1798 case IP_MULTICAST_TTL: 1799 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1800 return (sizeof (uchar_t)); 1801 case IP_MULTICAST_LOOP: 1802 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1803 return (sizeof (uchar_t)); 1804 } 1805 break; 1806 case IPPROTO_IPV6: 1807 switch (name) { 1808 case IPV6_MULTICAST_HOPS: 1809 *i1 = IP_DEFAULT_MULTICAST_TTL; 1810 return (sizeof (int)); 1811 case IPV6_MULTICAST_LOOP: 1812 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1813 return (sizeof (int)); 1814 case IPV6_UNICAST_HOPS: 1815 *i1 = is->is_ipv6_hoplimit; 1816 return (sizeof (int)); 1817 } 1818 break; 1819 case IPPROTO_ICMPV6: 1820 switch (name) { 1821 case ICMP6_FILTER: 1822 /* Make it look like "pass all" */ 1823 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1824 return (sizeof (icmp6_filter_t)); 1825 } 1826 break; 1827 } 1828 return (-1); 1829 } 1830 1831 /* 1832 * This routine retrieves the current status of socket options. 1833 * It returns the size of the option retrieved, or -1. 
1834 */ 1835 int 1836 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 1837 { 1838 icmp_t *icmp = connp->conn_icmp; 1839 int *i1 = (int *)ptr; 1840 conn_opt_arg_t coas; 1841 int retval; 1842 1843 coas.coa_connp = connp; 1844 coas.coa_ixa = connp->conn_ixa; 1845 coas.coa_ipp = &connp->conn_xmit_ipp; 1846 coas.coa_ancillary = B_FALSE; 1847 coas.coa_changed = 0; 1848 1849 /* 1850 * We assume that the optcom framework has checked for the set 1851 * of levels and names that are supported, hence we don't worry 1852 * about rejecting based on that. 1853 * First check for ICMP specific handling, then pass to common routine. 1854 */ 1855 switch (level) { 1856 case IPPROTO_IP: 1857 /* 1858 * Only allow IPv4 option processing on IPv4 sockets. 1859 */ 1860 if (connp->conn_family != AF_INET) 1861 return (-1); 1862 1863 switch (name) { 1864 case IP_OPTIONS: 1865 case T_IP_OPTIONS: 1866 /* Options are passed up with each packet */ 1867 return (0); 1868 case IP_HDRINCL: 1869 mutex_enter(&connp->conn_lock); 1870 *i1 = (int)icmp->icmp_hdrincl; 1871 mutex_exit(&connp->conn_lock); 1872 return (sizeof (int)); 1873 } 1874 break; 1875 1876 case IPPROTO_IPV6: 1877 /* 1878 * Only allow IPv6 option processing on native IPv6 sockets. 1879 */ 1880 if (connp->conn_family != AF_INET6) 1881 return (-1); 1882 1883 switch (name) { 1884 case IPV6_CHECKSUM: 1885 /* 1886 * Return offset or -1 if no checksum offset. 1887 * Does not apply to IPPROTO_ICMPV6 1888 */ 1889 if (connp->conn_proto == IPPROTO_ICMPV6) 1890 return (-1); 1891 1892 mutex_enter(&connp->conn_lock); 1893 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) 1894 *i1 = connp->conn_ixa->ixa_raw_cksum_offset; 1895 else 1896 *i1 = -1; 1897 mutex_exit(&connp->conn_lock); 1898 return (sizeof (int)); 1899 } 1900 break; 1901 1902 case IPPROTO_ICMPV6: 1903 /* 1904 * Only allow IPv6 option processing on native IPv6 sockets. 
1905 */ 1906 if (connp->conn_family != AF_INET6) 1907 return (-1); 1908 1909 if (connp->conn_proto != IPPROTO_ICMPV6) 1910 return (-1); 1911 1912 switch (name) { 1913 case ICMP6_FILTER: 1914 mutex_enter(&connp->conn_lock); 1915 if (icmp->icmp_filter == NULL) { 1916 /* Make it look like "pass all" */ 1917 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1918 } else { 1919 (void) bcopy(icmp->icmp_filter, ptr, 1920 sizeof (icmp6_filter_t)); 1921 } 1922 mutex_exit(&connp->conn_lock); 1923 return (sizeof (icmp6_filter_t)); 1924 } 1925 } 1926 mutex_enter(&connp->conn_lock); 1927 retval = conn_opt_get(&coas, level, name, ptr); 1928 mutex_exit(&connp->conn_lock); 1929 return (retval); 1930 } 1931 1932 /* 1933 * This routine retrieves the current status of socket options. 1934 * It returns the size of the option retrieved, or -1. 1935 */ 1936 int 1937 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 1938 { 1939 conn_t *connp = Q_TO_CONN(q); 1940 int err; 1941 1942 err = icmp_opt_get(connp, level, name, ptr); 1943 return (err); 1944 } 1945 1946 /* 1947 * This routine sets socket options. 1948 */ 1949 int 1950 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, 1951 uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) 1952 { 1953 conn_t *connp = coa->coa_connp; 1954 ip_xmit_attr_t *ixa = coa->coa_ixa; 1955 icmp_t *icmp = connp->conn_icmp; 1956 icmp_stack_t *is = icmp->icmp_is; 1957 int *i1 = (int *)invalp; 1958 boolean_t onoff = (*i1 == 0) ? 0 : 1; 1959 int error; 1960 1961 ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); 1962 1963 /* 1964 * For fixed length options, no sanity check 1965 * of passed in length is done. It is assumed *_optcom_req() 1966 * routines do the right thing. 
1967 */ 1968 1969 switch (level) { 1970 case SOL_SOCKET: 1971 switch (name) { 1972 case SO_PROTOTYPE: 1973 if ((*i1 & 0xFF) != IPPROTO_ICMP && 1974 (*i1 & 0xFF) != IPPROTO_ICMPV6 && 1975 secpolicy_net_rawaccess(cr) != 0) { 1976 return (EACCES); 1977 } 1978 if (checkonly) 1979 break; 1980 1981 mutex_enter(&connp->conn_lock); 1982 connp->conn_proto = *i1 & 0xFF; 1983 ixa->ixa_protocol = connp->conn_proto; 1984 if ((connp->conn_proto == IPPROTO_RAW || 1985 connp->conn_proto == IPPROTO_IGMP) && 1986 connp->conn_family == AF_INET) { 1987 icmp->icmp_hdrincl = 1; 1988 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 1989 } else if (connp->conn_proto == IPPROTO_UDP || 1990 connp->conn_proto == IPPROTO_TCP || 1991 connp->conn_proto == IPPROTO_SCTP) { 1992 /* Used by test applications like psh */ 1993 icmp->icmp_hdrincl = 0; 1994 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 1995 } else { 1996 icmp->icmp_hdrincl = 0; 1997 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 1998 } 1999 2000 if (connp->conn_family == AF_INET6 && 2001 connp->conn_proto == IPPROTO_ICMPV6) { 2002 /* Set offset for icmp6_cksum */ 2003 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2004 ixa->ixa_raw_cksum_offset = 2; 2005 } 2006 if (icmp->icmp_filter != NULL && 2007 connp->conn_proto != IPPROTO_ICMPV6) { 2008 kmem_free(icmp->icmp_filter, 2009 sizeof (icmp6_filter_t)); 2010 icmp->icmp_filter = NULL; 2011 } 2012 mutex_exit(&connp->conn_lock); 2013 2014 coa->coa_changed |= COA_HEADER_CHANGED; 2015 /* 2016 * For SCTP, we don't use icmp_bind_proto() for 2017 * raw socket binding. 2018 */ 2019 if (connp->conn_proto == IPPROTO_SCTP) 2020 return (0); 2021 2022 coa->coa_changed |= COA_ICMP_BIND_NEEDED; 2023 return (0); 2024 2025 case SO_SNDBUF: 2026 if (*i1 > is->is_max_buf) { 2027 return (ENOBUFS); 2028 } 2029 break; 2030 case SO_RCVBUF: 2031 if (*i1 > is->is_max_buf) { 2032 return (ENOBUFS); 2033 } 2034 break; 2035 } 2036 break; 2037 2038 case IPPROTO_IP: 2039 /* 2040 * Only allow IPv4 option processing on IPv4 sockets. 
2041 */ 2042 if (connp->conn_family != AF_INET) 2043 return (EINVAL); 2044 2045 switch (name) { 2046 case IP_HDRINCL: 2047 if (!checkonly) { 2048 mutex_enter(&connp->conn_lock); 2049 icmp->icmp_hdrincl = onoff; 2050 if (onoff) 2051 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2052 else 2053 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2054 mutex_exit(&connp->conn_lock); 2055 } 2056 break; 2057 } 2058 break; 2059 2060 case IPPROTO_IPV6: 2061 if (connp->conn_family != AF_INET6) 2062 return (EINVAL); 2063 2064 switch (name) { 2065 case IPV6_CHECKSUM: 2066 /* 2067 * Integer offset into the user data of where the 2068 * checksum is located. 2069 * Offset of -1 disables option. 2070 * Does not apply to IPPROTO_ICMPV6. 2071 */ 2072 if (connp->conn_proto == IPPROTO_ICMPV6 || 2073 coa->coa_ancillary) { 2074 return (EINVAL); 2075 } 2076 if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { 2077 /* Negative or not 16 bit aligned offset */ 2078 return (EINVAL); 2079 } 2080 if (checkonly) 2081 break; 2082 2083 mutex_enter(&connp->conn_lock); 2084 if (*i1 == -1) { 2085 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2086 ixa->ixa_raw_cksum_offset = 0; 2087 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2088 } else { 2089 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; 2090 ixa->ixa_raw_cksum_offset = *i1; 2091 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2092 } 2093 mutex_exit(&connp->conn_lock); 2094 break; 2095 } 2096 break; 2097 2098 case IPPROTO_ICMPV6: 2099 /* 2100 * Only allow IPv6 option processing on IPv6 sockets. 
/*
 * This routine sets socket options.
 *
 * connp		the raw IP connection being configured.
 * optset_context	one of the SETFN_* contexts; selects between
 *			check-only and real "negotiate" semantics.
 * level, name		the option being set.
 * inlen, invalp	length and value of the option as supplied.
 * outlenp, outvalp	on success receive inlen and a copy of the input
 *			value; *outlenp is set to 0 on most failures.
 * thisdg_attrs		if non-NULL, a conn_opt_arg_t describing per-datagram
 *			(ancillary) state supplied by the caller; otherwise
 *			the option is applied to the conn itself.
 * cr			credentials passed through to icmp_do_opt_set().
 *
 * Returns 0 on success or an errno value (EINVAL, ENOMEM, or whatever
 * icmp_do_opt_set()/icmp_build_hdr_template() returned).
 */
int
icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		err;
	conn_opt_arg_t	coas, *coa;
	boolean_t	checkonly;
	icmp_stack_t	*is = icmp->icmp_is;

	switch (optset_context) {
	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 * we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 * value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
		 */
		if (inlen == 0) {
			*outlenp = 0;
			return (0);
		}
		break;
	case SETFN_OPTCOM_NEGOTIATE:
		checkonly = B_FALSE;
		break;
	case SETFN_UD_NEGOTIATE:
	case SETFN_CONN_NEGOTIATE:
		checkonly = B_FALSE;
		/*
		 * Negotiating local and "association-related" options
		 * through T_UNITDATA_REQ.
		 *
		 * Following routine can filter out ones we do not
		 * want to be "set" this way.
		 */
		if (!icmp_opt_allow_udr_set(level, name)) {
			*outlenp = 0;
			return (EINVAL);
		}
		break;
	default:
		/*
		 * We should never get here
		 */
		*outlenp = 0;
		return (EINVAL);
	}

	/* The check-only case with inlen == 0 already returned above. */
	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));

	if (thisdg_attrs != NULL) {
		/* Options from T_UNITDATA_REQ */
		coa = (conn_opt_arg_t *)thisdg_attrs;
		ASSERT(coa->coa_connp == connp);
		ASSERT(coa->coa_ixa != NULL);
		ASSERT(coa->coa_ipp != NULL);
		ASSERT(coa->coa_ancillary);
	} else {
		/* Sticky option: build a local argument block for the conn. */
		coa = &coas;
		coas.coa_connp = connp;
		/* Get a reference on conn_ixa to prevent concurrent mods */
		coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
		if (coas.coa_ixa == NULL) {
			*outlenp = 0;
			return (ENOMEM);
		}
		coas.coa_ipp = &connp->conn_xmit_ipp;
		coas.coa_ancillary = B_FALSE;
		coas.coa_changed = 0;
	}

	err = icmp_do_opt_set(coa, level, name, inlen, invalp,
	    cr, checkonly);
	if (err != 0) {
errout:
		/*
		 * Only the non-ancillary path took its own ixa reference
		 * above, so only that path drops one here.
		 */
		if (!coa->coa_ancillary)
			ixa_refrele(coa->coa_ixa);
		*outlenp = 0;
		return (err);
	}

	/*
	 * Common case of OK return with outval same as inval.
	 */
	if (invalp != outvalp) {
		/* don't trust bcopy for identical src/dst */
		(void) bcopy(invalp, outvalp, inlen);
	}
	*outlenp = inlen;

	/*
	 * If this was not ancillary data, then we rebuild the headers,
	 * update the IRE/NCE, and IPsec as needed.
	 * Since the label depends on the destination we go through
	 * ip_set_destination first.
	 */
	if (coa->coa_ancillary) {
		return (0);
	}

	if (coa->coa_changed & COA_ROUTE_CHANGED) {
		in6_addr_t saddr, faddr, nexthop;
		in_port_t fport;

		/*
		 * We clear lastdst to make sure we pick up the change
		 * next time sending.
		 * If we are connected we re-cache the information.
		 * We ignore errors to preserve BSD behavior.
		 * Note that we don't redo IPsec policy lookup here
		 * since the final destination (or source) didn't change.
		 */
		mutex_enter(&connp->conn_lock);
		connp->conn_v6lastdst = ipv6_all_zeros;

		ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
		    &connp->conn_faddr_v6, &nexthop);
		/* Copy the addresses so conn_lock can be dropped first. */
		saddr = connp->conn_saddr_v6;
		faddr = connp->conn_faddr_v6;
		fport = connp->conn_fport;
		mutex_exit(&connp->conn_lock);

		if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
			(void) ip_attr_connect(connp, coa->coa_ixa,
			    &saddr, &faddr, &nexthop, fport, NULL, NULL,
			    IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
		}
	}

	/* Done with the ixa; only coa_changed is consulted from here on. */
	ixa_refrele(coa->coa_ixa);

	if (coa->coa_changed & COA_HEADER_CHANGED) {
		/*
		 * Rebuild the header template if we are connected.
		 * Otherwise clear conn_v6lastdst so we rebuild the header
		 * in the data path.
		 */
		mutex_enter(&connp->conn_lock);
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			err = icmp_build_hdr_template(connp,
			    &connp->conn_saddr_v6, &connp->conn_faddr_v6,
			    connp->conn_flowinfo);
			if (err != 0) {
				/*
				 * Note that *outlenp was already set to
				 * inlen above and is left that way here.
				 */
				mutex_exit(&connp->conn_lock);
				return (err);
			}
		} else {
			connp->conn_v6lastdst = ipv6_all_zeros;
		}
		mutex_exit(&connp->conn_lock);
	}
	if (coa->coa_changed & COA_RCVBUF_CHANGED) {
		(void) proto_set_rx_hiwat(connp->conn_rq, connp,
		    connp->conn_rcvbuf);
	}
	if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
	}
	if (coa->coa_changed & COA_WROFF_CHANGED) {
		/* Increase wroff if needed */
		uint_t wroff;

		mutex_enter(&connp->conn_lock);
		wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
		if (wroff > connp->conn_wroff) {
			connp->conn_wroff = wroff;
			/* Drop the lock before calling out of this module. */
			mutex_exit(&connp->conn_lock);
			(void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
		} else {
			mutex_exit(&connp->conn_lock);
		}
	}
	if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
		icmp_bind_proto(icmp);
	}
	return (err);
}

/*
 * This routine sets socket options.
 *
 * TPI entry point: maps the queue to its conn_t and hands every other
 * argument unchanged to icmp_opt_set(), returning its result.
 */
int
icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	conn_t	*connp = Q_TO_CONN(q);
	int error;

	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
	    outlenp, outvalp, thisdg_attrs, cr);
	return (error);
}
2353 */ 2354 mblk_t * 2355 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2356 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2357 mblk_t *data_mp, int *errorp) 2358 { 2359 mblk_t *mp; 2360 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2361 uint_t data_len; 2362 uint32_t cksum; 2363 2364 data_len = msgdsize(data_mp); 2365 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2366 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2367 if (mp == NULL) { 2368 ASSERT(*errorp != 0); 2369 return (NULL); 2370 } 2371 2372 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2373 2374 /* 2375 * If there was a routing option/header then conn_prepend_hdr 2376 * has massaged it and placed the pseudo-header checksum difference 2377 * in the cksum argument. 2378 * 2379 * Prepare for ICMPv6 checksum done in IP. 2380 * 2381 * We make it easy for IP to include our pseudo header 2382 * by putting our length (and any routing header adjustment) 2383 * in the ICMPv6 checksum field. 2384 * The IP source, destination, and length have already been set by 2385 * conn_prepend_hdr. 
2386 */ 2387 cksum += data_len; 2388 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2389 ASSERT(cksum < 0x10000); 2390 2391 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2392 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2393 2394 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2395 } else { 2396 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2397 uint_t cksum_offset = 0; 2398 2399 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2400 2401 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2402 if (connp->conn_proto == IPPROTO_ICMPV6) { 2403 cksum_offset = ixa->ixa_ip_hdr_length + 2404 offsetof(icmp6_t, icmp6_cksum); 2405 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2406 cksum_offset = ixa->ixa_ip_hdr_length + 2407 ixa->ixa_raw_cksum_offset; 2408 } 2409 } 2410 if (cksum_offset != 0) { 2411 uint16_t *ptr; 2412 2413 /* Make sure the checksum fits in the first mblk */ 2414 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2415 mblk_t *mp1; 2416 2417 mp1 = msgpullup(mp, 2418 cksum_offset + sizeof (short)); 2419 freemsg(mp); 2420 if (mp1 == NULL) { 2421 *errorp = ENOMEM; 2422 return (NULL); 2423 } 2424 mp = mp1; 2425 ip6h = (ip6_t *)mp->b_rptr; 2426 } 2427 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2428 *ptr = htons(cksum); 2429 } 2430 } 2431 2432 /* Note that we don't try to update wroff due to ancillary data */ 2433 return (mp); 2434 } 2435 2436 static int 2437 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2438 const in6_addr_t *v6dst, uint32_t flowinfo) 2439 { 2440 int error; 2441 2442 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2443 /* 2444 * We clear lastdst to make sure we don't use the lastdst path 2445 * next time sending since we might not have set v6dst yet. 2446 */ 2447 connp->conn_v6lastdst = ipv6_all_zeros; 2448 2449 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2450 if (error != 0) 2451 return (error); 2452 2453 /* 2454 * Any routing header/option has been massaged. The checksum difference 2455 * is stored in conn_sum. 
2456 */ 2457 return (0); 2458 } 2459 2460 static mblk_t * 2461 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2462 { 2463 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2464 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2465 /* 2466 * fallback has started but messages have not been moved yet 2467 */ 2468 if (icmp->icmp_fallback_queue_head == NULL) { 2469 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2470 icmp->icmp_fallback_queue_head = mp; 2471 icmp->icmp_fallback_queue_tail = mp; 2472 } else { 2473 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2474 icmp->icmp_fallback_queue_tail->b_next = mp; 2475 icmp->icmp_fallback_queue_tail = mp; 2476 } 2477 return (NULL); 2478 } else { 2479 /* 2480 * Fallback completed, let the caller putnext() the mblk. 2481 */ 2482 return (mp); 2483 } 2484 } 2485 2486 /* 2487 * Deliver data to ULP. In case we have a socket, and it's falling back to 2488 * TPI, then we'll queue the mp for later processing. 2489 */ 2490 static void 2491 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2492 { 2493 if (IPCL_IS_NONSTR(connp)) { 2494 icmp_t *icmp = connp->conn_icmp; 2495 int error; 2496 2497 ASSERT(len == msgdsize(mp)); 2498 if ((*connp->conn_upcalls->su_recv) 2499 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2500 mutex_enter(&icmp->icmp_recv_lock); 2501 if (error == ENOSPC) { 2502 /* 2503 * let's confirm while holding the lock 2504 */ 2505 if ((*connp->conn_upcalls->su_recv) 2506 (connp->conn_upper_handle, NULL, 0, 0, 2507 &error, NULL) < 0) { 2508 ASSERT(error == ENOSPC); 2509 if (error == ENOSPC) { 2510 connp->conn_flow_cntrld = 2511 B_TRUE; 2512 } 2513 } 2514 mutex_exit(&icmp->icmp_recv_lock); 2515 } else { 2516 ASSERT(error == EOPNOTSUPP); 2517 mp = icmp_queue_fallback(icmp, mp); 2518 mutex_exit(&icmp->icmp_recv_lock); 2519 if (mp != NULL) 2520 putnext(connp->conn_rq, mp); 2521 } 2522 } 2523 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2524 } else { 2525 putnext(connp->conn_rq, mp); 2526 } 2527 } 2528 2529 /* 2530 * This is 
/*
 * This is the inbound data path.
 * IP has already pulled up the IP headers and verified alignment
 * etc.
 *
 * arg1 is the conn_t of the receiving raw socket, mp the M_DATA packet
 * starting at the IP header, and ira the receive attributes supplied by IP.
 * arg2 is unused.
 *
 * The packet is wrapped in a T_UNITDATA_IND carrying the source address and
 * any requested ancillary data and handed to icmp_ulp_recv(). IPv4 packets
 * are delivered with their IP header; IPv6 packets have the IP headers
 * skipped. On allocation failure, a blocked ICMPv6 type, or a failed raw
 * checksum the packet is freed and nothing is delivered.
 */
/* ARGSUSED2 */
static void
icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t			*connp = (conn_t *)arg1;
	struct T_unitdata_ind	*tudi;
	uchar_t			*rptr;		/* Pointer to IP header */
	int			ip_hdr_length;
	int			udi_size;	/* Size of T_unitdata_ind */
	int			pkt_len;
	icmp_t			*icmp;
	ip_pkt_t		ipps;
	ip6_t			*ip6h;
	mblk_t			*mp1;
	crb_t			recv_ancillary;
	icmp_stack_t		*is;
	sin_t			*sin;
	sin6_t			*sin6;
	ipha_t			*ipha;

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);

	icmp = connp->conn_icmp;
	is = icmp->icmp_is;
	rptr = mp->b_rptr;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(OK_32PTR(rptr));
	ASSERT(ira->ira_pktlen == msgdsize(mp));
	pkt_len = ira->ira_pktlen;

	/*
	 * Get a snapshot of these and allow other threads to change
	 * them after that. We need the same recv_ancillary when determining
	 * the size as when adding the ancillary data items.
	 */
	mutex_enter(&connp->conn_lock);
	recv_ancillary = connp->conn_recv_ancillary;
	mutex_exit(&connp->conn_lock);

	ip_hdr_length = ira->ira_ip_hdr_length;
	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */

	/* Initialize regardless of IP version */
	ipps.ipp_fields = 0;

	if (ira->ira_flags & IRAF_IS_IPV4) {
		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));

		ipha = (ipha_t *)mp->b_rptr;
		/* Header fields are only parsed if ancillary data is wanted */
		if (recv_ancillary.crb_all != 0)
			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);

		/*
		 * BSD for some reason adjusts ipha_length to exclude the
		 * IP header length. We do the same.
		 */
		if (is->is_bsd_compat) {
			ushort_t len;

			len = ntohs(ipha->ipha_length);
			if (mp->b_datap->db_ref > 1) {
				/*
				 * Allocate a new IP header so that we can
				 * modify ipha_length.
				 *
				 * NOTE(review): this mp1 shadows the
				 * function-scope mp1 declared above.
				 */
				mblk_t *mp1;

				mp1 = allocb(ip_hdr_length, BPRI_MED);
				if (mp1 == NULL) {
					freemsg(mp);
					BUMP_MIB(&is->is_rawip_mib,
					    rawipInErrors);
					return;
				}
				/*
				 * Copy the header into the new mblk, advance
				 * the original past it, and chain the
				 * original behind the private copy.
				 */
				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
				mp->b_rptr = rptr + ip_hdr_length;
				rptr = mp1->b_rptr;
				ipha = (ipha_t *)rptr;
				mp1->b_cont = mp;
				mp1->b_wptr = rptr + ip_hdr_length;
				mp = mp1;
			}
			len -= ip_hdr_length;
			ipha->ipha_length = htons(len);
		}

		/*
		 * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
		 * sockets. This is ensured by icmp_bind and the IP fanout code.
		 */
		ASSERT(connp->conn_family == AF_INET);

		/*
		 * This is the inbound data path. Packets are passed upstream
		 * as T_UNITDATA_IND messages with full IPv4 headers still
		 * attached.
		 */

		/*
		 * Normally only send up the source address.
		 * If any ancillary data items are wanted we add those.
		 */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
		if (recv_ancillary.crb_all != 0) {
			udi_size += conn_recvancillary_size(connp,
			    recv_ancillary, ira, mp, &ipps);
		}

		/* Allocate a message block for the T_UNITDATA_IND structure. */
		mp1 = allocb(udi_size, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
			return;
		}
		mp1->b_cont = mp;
		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
		mp1->b_datap->db_type = M_PROTO;
		mp1->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		/* The source address immediately follows the TPI header. */
		sin = (sin_t *)&tudi[1];
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipha->ipha_src;
		*(uint32_t *)&sin->sin_zero[0] = 0;
		*(uint32_t *)&sin->sin_zero[4] = 0;
		tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
		    sizeof (sin_t);
		/* From here on udi_size is the length of the options only. */
		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
		tudi->OPT_length = udi_size;

		/*
		 * Add options if IP_RECVIF etc is set
		 */
		if (udi_size != 0) {
			conn_recvancillary_add(connp, recv_ancillary, ira,
			    &ipps, (uchar_t *)&sin[1], udi_size);
		}
		goto deliver;
	}

	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
	/*
	 * IPv6 packets can only be received by applications
	 * that are prepared to receive IPv6 addresses.
	 * The IP fanout must ensure this.
	 */
	ASSERT(connp->conn_family == AF_INET6);

	/*
	 * Handle IPv6 packets. We don't pass up the IP headers with the
	 * payload for IPv6.
	 */

	ip6h = (ip6_t *)rptr;
	if (recv_ancillary.crb_all != 0) {
		/*
		 * Call on ip_find_hdr_v6 which gets individual lengths of
		 * extension headers (and pointers to them).
		 */
		uint8_t nexthdr;

		/* We don't care about the length or nextheader. */
		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);

		/*
		 * We do not pass up hop-by-hop options or any other
		 * extension header as part of the packet. Applications
		 * that want to see them have to specify IPV6_RECV* socket
		 * options. And conn_recvancillary_size/add explicitly
		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
		 *
		 * If we had multilevel ICMP sockets, then we'd want to
		 * modify conn_recvancillary_size/add to
		 * allow the user to see the label.
		 */
	}

	/*
	 * Check a filter for ICMPv6 types if needed.
	 * Verify raw checksums if needed.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_filter != NULL) {
		int type;

		/* Assumes that IP has done the pullupmsg */
		type = mp->b_rptr[ip_hdr_length];

		/*
		 * NOTE(review): this check runs after the byte has been read
		 * and uses <=, so it does not by itself prove that the type
		 * byte lies inside the mblk - confirm what IP guarantees.
		 */
		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			return;
		}
	}
	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
		/* Checksum */
		uint16_t *up;
		uint32_t sum;
		int remlen;

		/* up[0..7] is the source address, up[8..15] the destination */
		up = (uint16_t *)&ip6h->ip6_src;

		remlen = msgdsize(mp) - ip_hdr_length;
		/* Pseudo-header: protocol + length + both addresses */
		sum = htons(connp->conn_proto + remlen)
		    + up[0] + up[1] + up[2] + up[3]
		    + up[4] + up[5] + up[6] + up[7]
		    + up[8] + up[9] + up[10] + up[11]
		    + up[12] + up[13] + up[14] + up[15];
		sum = (sum & 0xffff) + (sum >> 16);
		sum = IP_CSUM(mp, ip_hdr_length, sum);
		if (sum != 0) {
			/* IPv6 RAW checksum failed */
			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);

	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);

	if (recv_ancillary.crb_all != 0) {
		udi_size += conn_recvancillary_size(connp,
		    recv_ancillary, ira, mp, &ipps);
	}

	/* Allocate a message block for the T_UNITDATA_IND structure. */
	mp1 = allocb(udi_size, BPRI_MED);
	if (mp1 == NULL) {
		freemsg(mp);
		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
		return;
	}
	mp1->b_cont = mp;
	mp1->b_datap->db_type = M_PROTO;
	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
	mp1->b_wptr = (uchar_t *)tudi + udi_size;
	tudi->PRIM_type = T_UNITDATA_IND;
	tudi->SRC_length = sizeof (sin6_t);
	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
	/* From here on udi_size is the length of the options only. */
	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
	tudi->OPT_length = udi_size;
	/* The source address immediately follows the TPI header. */
	sin6 = (sin6_t *)&tudi[1];
	*sin6 = sin6_null;
	sin6->sin6_port = 0;
	sin6->sin6_family = AF_INET6;

	sin6->sin6_addr = ip6h->ip6_src;
	/* No sin6_flowinfo per API */
	sin6->sin6_flowinfo = 0;
	/* For link-scope pass up scope id */
	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
		sin6->sin6_scope_id = ira->ira_ruifindex;
	else
		sin6->sin6_scope_id = 0;
	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
	    IPCL_ZONEID(connp), is->is_netstack);

	if (udi_size != 0) {
		conn_recvancillary_add(connp, recv_ancillary, ira,
		    &ipps, (uchar_t *)&sin6[1], udi_size);
	}

	/* Skip all the IPv6 headers per API */
	mp->b_rptr += ip_hdr_length;
	pkt_len -= ip_hdr_length;

deliver:
	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
	icmp_ulp_recv(connp, mp1, pkt_len);
}
2818 */ 2819 mblk_t * 2820 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2821 { 2822 mblk_t *mpdata; 2823 struct opthdr *optp; 2824 conn_t *connp = Q_TO_CONN(q); 2825 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2826 mblk_t *mp2ctl; 2827 2828 /* 2829 * make a copy of the original message 2830 */ 2831 mp2ctl = copymsg(mpctl); 2832 2833 if (mpctl == NULL || 2834 (mpdata = mpctl->b_cont) == NULL) { 2835 freemsg(mpctl); 2836 freemsg(mp2ctl); 2837 return (0); 2838 } 2839 2840 /* fixed length structure for IPv4 and IPv6 counters */ 2841 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2842 optp->level = EXPER_RAWIP; 2843 optp->name = 0; 2844 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2845 sizeof (is->is_rawip_mib)); 2846 optp->len = msgdsize(mpdata); 2847 qreply(q, mpctl); 2848 2849 return (mp2ctl); 2850 } 2851 2852 /* 2853 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2854 * TODO: If this ever actually tries to set anything, it needs to be 2855 * to do the appropriate locking. 2856 */ 2857 /* ARGSUSED */ 2858 int 2859 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2860 uchar_t *ptr, int len) 2861 { 2862 switch (level) { 2863 case EXPER_RAWIP: 2864 return (0); 2865 default: 2866 return (1); 2867 } 2868 } 2869 2870 /* 2871 * This routine creates a T_UDERROR_IND message and passes it upstream. 2872 * The address and options are copied from the T_UNITDATA_REQ message 2873 * passed in mp. This message is freed. 
2874 */ 2875 static void 2876 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2877 { 2878 struct T_unitdata_req *tudr; 2879 mblk_t *mp1; 2880 uchar_t *destaddr; 2881 t_scalar_t destlen; 2882 uchar_t *optaddr; 2883 t_scalar_t optlen; 2884 2885 if ((mp->b_wptr < mp->b_rptr) || 2886 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2887 goto done; 2888 } 2889 tudr = (struct T_unitdata_req *)mp->b_rptr; 2890 destaddr = mp->b_rptr + tudr->DEST_offset; 2891 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2892 destaddr + tudr->DEST_length < mp->b_rptr || 2893 destaddr + tudr->DEST_length > mp->b_wptr) { 2894 goto done; 2895 } 2896 optaddr = mp->b_rptr + tudr->OPT_offset; 2897 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2898 optaddr + tudr->OPT_length < mp->b_rptr || 2899 optaddr + tudr->OPT_length > mp->b_wptr) { 2900 goto done; 2901 } 2902 destlen = tudr->DEST_length; 2903 optlen = tudr->OPT_length; 2904 2905 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2906 (char *)optaddr, optlen, err); 2907 if (mp1 != NULL) 2908 qreply(q, mp1); 2909 2910 done: 2911 freemsg(mp); 2912 } 2913 2914 static int 2915 rawip_do_unbind(conn_t *connp) 2916 { 2917 icmp_t *icmp = connp->conn_icmp; 2918 2919 mutex_enter(&connp->conn_lock); 2920 /* If a bind has not been done, we can't unbind. 
*/ 2921 if (icmp->icmp_state == TS_UNBND) { 2922 mutex_exit(&connp->conn_lock); 2923 return (-TOUTSTATE); 2924 } 2925 connp->conn_saddr_v6 = ipv6_all_zeros; 2926 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2927 connp->conn_laddr_v6 = ipv6_all_zeros; 2928 connp->conn_mcbc_bind = B_FALSE; 2929 connp->conn_lport = 0; 2930 connp->conn_fport = 0; 2931 /* In case we were also connected */ 2932 connp->conn_faddr_v6 = ipv6_all_zeros; 2933 connp->conn_v6lastdst = ipv6_all_zeros; 2934 2935 icmp->icmp_state = TS_UNBND; 2936 2937 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2938 &connp->conn_faddr_v6, connp->conn_flowinfo); 2939 mutex_exit(&connp->conn_lock); 2940 2941 ip_unbind(connp); 2942 return (0); 2943 } 2944 2945 /* 2946 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2947 * After some error checking, the message is passed downstream to ip. 2948 */ 2949 static void 2950 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2951 { 2952 conn_t *connp = Q_TO_CONN(q); 2953 int error; 2954 2955 ASSERT(mp->b_cont == NULL); 2956 error = rawip_do_unbind(connp); 2957 if (error) { 2958 if (error < 0) { 2959 icmp_err_ack(q, mp, -error, 0); 2960 } else { 2961 icmp_err_ack(q, mp, 0, error); 2962 } 2963 return; 2964 } 2965 2966 /* 2967 * Convert mp into a T_OK_ACK 2968 */ 2969 2970 mp = mi_tpi_ok_ack_alloc(mp); 2971 2972 /* 2973 * should not happen in practice... T_OK_ACK is smaller than the 2974 * original message. 2975 */ 2976 ASSERT(mp != NULL); 2977 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 2978 qreply(q, mp); 2979 } 2980 2981 /* 2982 * Process IPv4 packets that already include an IP header. 2983 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 2984 * IPPROTO_IGMP). 2985 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 2986 * 2987 * The packet is assumed to have a base (20 byte) IP header followed 2988 * by the upper-layer protocol. 
We include any IP_OPTIONS including a 2989 * CIPSO label but otherwise preserve the base IP header. 2990 */ 2991 static int 2992 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 2993 { 2994 icmp_t *icmp = connp->conn_icmp; 2995 icmp_stack_t *is = icmp->icmp_is; 2996 ipha_t iphas; 2997 ipha_t *ipha; 2998 int ip_hdr_length; 2999 int tp_hdr_len; 3000 ip_xmit_attr_t *ixa; 3001 ip_pkt_t *ipp; 3002 in6_addr_t v6src; 3003 in6_addr_t v6dst; 3004 in6_addr_t v6nexthop; 3005 int error; 3006 boolean_t do_ipsec; 3007 3008 /* 3009 * We need an exclusive copy of conn_ixa since the included IP 3010 * header could have any destination. 3011 * That copy has no pointers hence we 3012 * need to set them up once we've parsed the ancillary data. 3013 */ 3014 ixa = conn_get_ixa_exclusive(connp); 3015 if (ixa == NULL) { 3016 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3017 freemsg(mp); 3018 return (ENOMEM); 3019 } 3020 ASSERT(cr != NULL); 3021 /* 3022 * Caller has a reference on cr; from db_credp or because we 3023 * are running in process context. 
3024 */ 3025 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3026 ixa->ixa_cred = cr; 3027 ixa->ixa_cpid = pid; 3028 if (is_system_labeled()) { 3029 /* We need to restart with a label based on the cred */ 3030 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3031 } 3032 3033 /* In case previous destination was multicast or multirt */ 3034 ip_attr_newdst(ixa); 3035 3036 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3037 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3038 if (ipp == NULL) { 3039 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3040 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3041 ixa->ixa_cpid = connp->conn_cpid; 3042 ixa_refrele(ixa); 3043 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3044 freemsg(mp); 3045 return (ENOMEM); 3046 } 3047 mutex_enter(&connp->conn_lock); 3048 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3049 mutex_exit(&connp->conn_lock); 3050 if (error != 0) { 3051 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3052 freemsg(mp); 3053 goto done; 3054 } 3055 3056 /* Sanity check length of packet */ 3057 ipha = (ipha_t *)mp->b_rptr; 3058 3059 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3060 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3061 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3062 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3063 freemsg(mp); 3064 goto done; 3065 } 3066 ipha = (ipha_t *)mp->b_rptr; 3067 } 3068 ipha->ipha_version_and_hdr_length = 3069 (IP_VERSION<<4) | (ip_hdr_length>>2); 3070 3071 /* 3072 * We set IXAF_DONTFRAG if the application set DF which makes 3073 * IP not fragment. 
3074 */ 3075 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3076 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3077 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3078 else 3079 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3080 3081 /* Even for multicast and broadcast we honor the apps ttl */ 3082 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3083 3084 /* 3085 * No source verification for non-local addresses 3086 */ 3087 if (ipha->ipha_src != INADDR_ANY && 3088 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3089 is->is_netstack->netstack_ip, B_FALSE) 3090 != IPVL_UNICAST_UP) { 3091 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3092 } 3093 3094 if (ipha->ipha_dst == INADDR_ANY) 3095 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3096 3097 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3098 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3099 3100 /* Defer IPsec if it might need to look at ICMP type/code */ 3101 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3102 ixa->ixa_flags |= IXAF_IS_IPV4; 3103 3104 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3105 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3106 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3107 (do_ipsec ? IPDF_IPSEC : 0)); 3108 switch (error) { 3109 case 0: 3110 break; 3111 case EADDRNOTAVAIL: 3112 /* 3113 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3114 * Don't have the application see that errno 3115 */ 3116 error = ENETUNREACH; 3117 goto failed; 3118 case ENETDOWN: 3119 /* 3120 * Have !ipif_addr_ready address; drop packet silently 3121 * until we can get applications to not send until we 3122 * are ready. 3123 */ 3124 error = 0; 3125 goto failed; 3126 case EHOSTUNREACH: 3127 case ENETUNREACH: 3128 if (ixa->ixa_ire != NULL) { 3129 /* 3130 * Let conn_ip_output/ire_send_noroute return 3131 * the error and send any local ICMP error. 
3132 */ 3133 error = 0; 3134 break; 3135 } 3136 /* FALLTHRU */ 3137 default: 3138 failed: 3139 freemsg(mp); 3140 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3141 goto done; 3142 } 3143 if (ipha->ipha_src == INADDR_ANY) 3144 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3145 3146 /* 3147 * We might be going to a different destination than last time, 3148 * thus check that TX allows the communication and compute any 3149 * needed label. 3150 * 3151 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3152 * don't have to worry about concurrent threads. 3153 */ 3154 if (is_system_labeled()) { 3155 /* 3156 * Check whether Trusted Solaris policy allows communication 3157 * with this host, and pretend that the destination is 3158 * unreachable if not. 3159 * Compute any needed label and place it in ipp_label_v4/v6. 3160 * 3161 * Later conn_build_hdr_template/conn_prepend_hdr takes 3162 * ipp_label_v4/v6 to form the packet. 3163 * 3164 * Tsol note: We have ipp structure local to this thread so 3165 * no locking is needed. 3166 */ 3167 error = conn_update_label(connp, ixa, &v6dst, ipp); 3168 if (error != 0) { 3169 freemsg(mp); 3170 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3171 goto done; 3172 } 3173 } 3174 3175 /* 3176 * Save away a copy of the IPv4 header the application passed down 3177 * and then prepend an IPv4 header complete with any IP options 3178 * including label. 3179 * We need a struct copy since icmp_prepend_hdr will reuse the available 3180 * space in the mblk. 
3181 */ 3182 iphas = *ipha; 3183 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3184 3185 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3186 if (mp == NULL) { 3187 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3188 ASSERT(error != 0); 3189 goto done; 3190 } 3191 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3192 error = EMSGSIZE; 3193 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3194 freemsg(mp); 3195 goto done; 3196 } 3197 /* Restore key parts of the header that the application passed down */ 3198 ipha = (ipha_t *)mp->b_rptr; 3199 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3200 ipha->ipha_ident = iphas.ipha_ident; 3201 ipha->ipha_fragment_offset_and_flags = 3202 iphas.ipha_fragment_offset_and_flags; 3203 ipha->ipha_ttl = iphas.ipha_ttl; 3204 ipha->ipha_protocol = iphas.ipha_protocol; 3205 ipha->ipha_src = iphas.ipha_src; 3206 ipha->ipha_dst = iphas.ipha_dst; 3207 3208 ixa->ixa_protocol = ipha->ipha_protocol; 3209 3210 /* 3211 * Make sure that the IP header plus any transport header that is 3212 * checksumed by ip_output is in the first mblk. (ip_output assumes 3213 * that at least the checksum field is in the first mblk.) 
3214 */ 3215 switch (ipha->ipha_protocol) { 3216 case IPPROTO_UDP: 3217 tp_hdr_len = 8; 3218 break; 3219 case IPPROTO_TCP: 3220 tp_hdr_len = 20; 3221 break; 3222 default: 3223 tp_hdr_len = 0; 3224 break; 3225 } 3226 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3227 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3228 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3229 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3230 if (mp->b_cont == NULL) 3231 error = EINVAL; 3232 else 3233 error = ENOMEM; 3234 freemsg(mp); 3235 goto done; 3236 } 3237 } 3238 3239 if (!do_ipsec) { 3240 /* Policy might differ for different ICMP type/code */ 3241 if (ixa->ixa_ipsec_policy != NULL) { 3242 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3243 ixa->ixa_ipsec_policy = NULL; 3244 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3245 } 3246 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3247 if (mp == NULL) { 3248 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3249 error = EHOSTUNREACH; /* IPsec policy failure */ 3250 goto done; 3251 } 3252 } 3253 3254 /* We're done. Pass the packet to ip. */ 3255 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3256 3257 error = conn_ip_output(mp, ixa); 3258 /* No rawipOutErrors if an error since IP increases its error counter */ 3259 switch (error) { 3260 case 0: 3261 break; 3262 case EWOULDBLOCK: 3263 (void) ixa_check_drain_insert(connp, ixa); 3264 error = 0; 3265 break; 3266 case EADDRNOTAVAIL: 3267 /* 3268 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3269 * Don't have the application see that errno 3270 */ 3271 error = ENETUNREACH; 3272 break; 3273 } 3274 done: 3275 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3276 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3277 ixa->ixa_cpid = connp->conn_cpid; 3278 ixa_refrele(ixa); 3279 ip_pkt_free(ipp); 3280 kmem_free(ipp, sizeof (*ipp)); 3281 return (error); 3282 } 3283 3284 static mblk_t * 3285 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3286 { 3287 ipha_t *ipha = NULL; 3288 ip6_t *ip6h = NULL; 3289 3290 if (ixa->ixa_flags & IXAF_IS_IPV4) 3291 ipha = (ipha_t *)mp->b_rptr; 3292 else 3293 ip6h = (ip6_t *)mp->b_rptr; 3294 3295 if (ixa->ixa_ipsec_policy != NULL) { 3296 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3297 ixa->ixa_ipsec_policy = NULL; 3298 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3299 } 3300 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3301 } 3302 3303 /* 3304 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3305 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3306 * the TPI options, otherwise we take them from msg_control. 3307 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3308 * Always consumes mp; never consumes tudr_mp. 3309 */ 3310 static int 3311 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3312 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3313 { 3314 icmp_t *icmp = connp->conn_icmp; 3315 icmp_stack_t *is = icmp->icmp_is; 3316 int error; 3317 ip_xmit_attr_t *ixa; 3318 ip_pkt_t *ipp; 3319 in6_addr_t v6src; 3320 in6_addr_t v6dst; 3321 in6_addr_t v6nexthop; 3322 in_port_t dstport; 3323 uint32_t flowinfo; 3324 uint_t srcid; 3325 int is_absreq_failure = 0; 3326 conn_opt_arg_t coas, *coa; 3327 3328 ASSERT(tudr_mp != NULL || msg != NULL); 3329 3330 /* 3331 * Get ixa before checking state to handle a disconnect race. 
3332 * 3333 * We need an exclusive copy of conn_ixa since the ancillary data 3334 * options might modify it. That copy has no pointers hence we 3335 * need to set them up once we've parsed the ancillary data. 3336 */ 3337 ixa = conn_get_ixa_exclusive(connp); 3338 if (ixa == NULL) { 3339 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3340 freemsg(mp); 3341 return (ENOMEM); 3342 } 3343 ASSERT(cr != NULL); 3344 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3345 ixa->ixa_cred = cr; 3346 ixa->ixa_cpid = pid; 3347 if (is_system_labeled()) { 3348 /* We need to restart with a label based on the cred */ 3349 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3350 } 3351 3352 /* In case previous destination was multicast or multirt */ 3353 ip_attr_newdst(ixa); 3354 3355 /* Get a copy of conn_xmit_ipp since the options might change it */ 3356 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3357 if (ipp == NULL) { 3358 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3359 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3360 ixa->ixa_cpid = connp->conn_cpid; 3361 ixa_refrele(ixa); 3362 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3363 freemsg(mp); 3364 return (ENOMEM); 3365 } 3366 mutex_enter(&connp->conn_lock); 3367 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3368 mutex_exit(&connp->conn_lock); 3369 if (error != 0) { 3370 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3371 freemsg(mp); 3372 goto done; 3373 } 3374 3375 /* 3376 * Parse the options and update ixa and ipp as a result. 
3377 */ 3378 3379 coa = &coas; 3380 coa->coa_connp = connp; 3381 coa->coa_ixa = ixa; 3382 coa->coa_ipp = ipp; 3383 coa->coa_ancillary = B_TRUE; 3384 coa->coa_changed = 0; 3385 3386 if (msg != NULL) { 3387 error = process_auxiliary_options(connp, msg->msg_control, 3388 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3389 } else { 3390 struct T_unitdata_req *tudr; 3391 3392 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3393 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3394 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3395 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3396 coa, &is_absreq_failure); 3397 } 3398 if (error != 0) { 3399 /* 3400 * Note: No special action needed in this 3401 * module for "is_absreq_failure" 3402 */ 3403 freemsg(mp); 3404 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3405 goto done; 3406 } 3407 ASSERT(is_absreq_failure == 0); 3408 3409 mutex_enter(&connp->conn_lock); 3410 /* 3411 * If laddr is unspecified then we look at sin6_src_id. 3412 * We will give precedence to a source address set with IPV6_PKTINFO 3413 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3414 * want ip_attr_connect to select a source (since it can fail) when 3415 * IPV6_PKTINFO is specified. 3416 * If this doesn't result in a source address then we get a source 3417 * from ip_attr_connect() below. 
3418 */ 3419 v6src = connp->conn_saddr_v6; 3420 if (sin != NULL) { 3421 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3422 dstport = sin->sin_port; 3423 flowinfo = 0; 3424 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3425 ixa->ixa_flags |= IXAF_IS_IPV4; 3426 } else if (sin6 != NULL) { 3427 v6dst = sin6->sin6_addr; 3428 dstport = sin6->sin6_port; 3429 flowinfo = sin6->sin6_flowinfo; 3430 srcid = sin6->__sin6_src_id; 3431 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3432 ixa->ixa_scopeid = sin6->sin6_scope_id; 3433 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3434 } else { 3435 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3436 } 3437 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3438 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3439 connp->conn_netstack); 3440 } 3441 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 3442 ixa->ixa_flags |= IXAF_IS_IPV4; 3443 else 3444 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3445 } else { 3446 /* Connected case */ 3447 v6dst = connp->conn_faddr_v6; 3448 flowinfo = connp->conn_flowinfo; 3449 } 3450 mutex_exit(&connp->conn_lock); 3451 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 3452 if (ipp->ipp_fields & IPPF_ADDR) { 3453 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3454 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3455 v6src = ipp->ipp_addr; 3456 } else { 3457 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3458 v6src = ipp->ipp_addr; 3459 } 3460 } 3461 /* 3462 * Allow source not assigned to the system 3463 * only if it is not a local addresses 3464 */ 3465 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3466 ip_laddr_t laddr_type; 3467 3468 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3469 ipaddr_t v4src; 3470 3471 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3472 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3473 is->is_netstack->netstack_ip, B_FALSE); 3474 } else { 3475 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3476 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3477 } 3478 if (laddr_type != IPVL_UNICAST_UP) 3479 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3480 } 3481 3482 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3483 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3484 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3485 3486 switch (error) { 3487 case 0: 3488 break; 3489 case EADDRNOTAVAIL: 3490 /* 3491 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3492 * Don't have the application see that errno 3493 */ 3494 error = ENETUNREACH; 3495 goto failed; 3496 case ENETDOWN: 3497 /* 3498 * Have !ipif_addr_ready address; drop packet silently 3499 * until we can get applications to not send until we 3500 * are ready. 3501 */ 3502 error = 0; 3503 goto failed; 3504 case EHOSTUNREACH: 3505 case ENETUNREACH: 3506 if (ixa->ixa_ire != NULL) { 3507 /* 3508 * Let conn_ip_output/ire_send_noroute return 3509 * the error and send any local ICMP error. 
3510 */ 3511 error = 0; 3512 break; 3513 } 3514 /* FALLTHRU */ 3515 default: 3516 failed: 3517 freemsg(mp); 3518 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3519 goto done; 3520 } 3521 3522 /* 3523 * We might be going to a different destination than last time, 3524 * thus check that TX allows the communication and compute any 3525 * needed label. 3526 * 3527 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3528 * don't have to worry about concurrent threads. 3529 */ 3530 if (is_system_labeled()) { 3531 /* 3532 * Check whether Trusted Solaris policy allows communication 3533 * with this host, and pretend that the destination is 3534 * unreachable if not. 3535 * Compute any needed label and place it in ipp_label_v4/v6. 3536 * 3537 * Later conn_build_hdr_template/conn_prepend_hdr takes 3538 * ipp_label_v4/v6 to form the packet. 3539 * 3540 * Tsol note: We have ipp structure local to this thread so 3541 * no locking is needed. 3542 */ 3543 error = conn_update_label(connp, ixa, &v6dst, ipp); 3544 if (error != 0) { 3545 freemsg(mp); 3546 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3547 goto done; 3548 } 3549 } 3550 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3551 &error); 3552 if (mp == NULL) { 3553 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3554 ASSERT(error != 0); 3555 goto done; 3556 } 3557 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3558 error = EMSGSIZE; 3559 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3560 freemsg(mp); 3561 goto done; 3562 } 3563 3564 /* Policy might differ for different ICMP type/code */ 3565 mp = icmp_output_attach_policy(mp, connp, ixa); 3566 if (mp == NULL) { 3567 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3568 error = EHOSTUNREACH; /* IPsec policy failure */ 3569 goto done; 3570 } 3571 3572 /* We're done. Pass the packet to ip. 
*/ 3573 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3574 3575 error = conn_ip_output(mp, ixa); 3576 if (!connp->conn_unspec_src) 3577 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3578 /* No rawipOutErrors if an error since IP increases its error counter */ 3579 switch (error) { 3580 case 0: 3581 break; 3582 case EWOULDBLOCK: 3583 (void) ixa_check_drain_insert(connp, ixa); 3584 error = 0; 3585 break; 3586 case EADDRNOTAVAIL: 3587 /* 3588 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3589 * Don't have the application see that errno 3590 */ 3591 error = ENETUNREACH; 3592 /* FALLTHRU */ 3593 default: 3594 mutex_enter(&connp->conn_lock); 3595 /* 3596 * Clear the source and v6lastdst so we call ip_attr_connect 3597 * for the next packet and try to pick a better source. 3598 */ 3599 if (connp->conn_mcbc_bind) 3600 connp->conn_saddr_v6 = ipv6_all_zeros; 3601 else 3602 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3603 connp->conn_v6lastdst = ipv6_all_zeros; 3604 mutex_exit(&connp->conn_lock); 3605 break; 3606 } 3607 done: 3608 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3609 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3610 ixa->ixa_cpid = connp->conn_cpid; 3611 ixa_refrele(ixa); 3612 ip_pkt_free(ipp); 3613 kmem_free(ipp, sizeof (*ipp)); 3614 return (error); 3615 } 3616 3617 /* 3618 * Handle sending an M_DATA for a connected socket. 3619 * Handles both IPv4 and IPv6. 3620 */ 3621 int 3622 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3623 { 3624 icmp_t *icmp = connp->conn_icmp; 3625 icmp_stack_t *is = icmp->icmp_is; 3626 int error; 3627 ip_xmit_attr_t *ixa; 3628 boolean_t do_ipsec; 3629 3630 /* 3631 * If no other thread is using conn_ixa this just gets a reference to 3632 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 
3633 */ 3634 ixa = conn_get_ixa(connp, B_FALSE); 3635 if (ixa == NULL) { 3636 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3637 freemsg(mp); 3638 return (ENOMEM); 3639 } 3640 3641 ASSERT(cr != NULL); 3642 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3643 ixa->ixa_cred = cr; 3644 ixa->ixa_cpid = pid; 3645 3646 /* Defer IPsec if it might need to look at ICMP type/code */ 3647 switch (ixa->ixa_protocol) { 3648 case IPPROTO_ICMP: 3649 case IPPROTO_ICMPV6: 3650 do_ipsec = B_FALSE; 3651 break; 3652 default: 3653 do_ipsec = B_TRUE; 3654 } 3655 3656 mutex_enter(&connp->conn_lock); 3657 mp = icmp_prepend_header_template(connp, ixa, mp, 3658 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3659 3660 if (mp == NULL) { 3661 ASSERT(error != 0); 3662 mutex_exit(&connp->conn_lock); 3663 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3664 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3665 ixa->ixa_cpid = connp->conn_cpid; 3666 ixa_refrele(ixa); 3667 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3668 freemsg(mp); 3669 return (error); 3670 } 3671 3672 if (!do_ipsec) { 3673 /* Policy might differ for different ICMP type/code */ 3674 mp = icmp_output_attach_policy(mp, connp, ixa); 3675 if (mp == NULL) { 3676 mutex_exit(&connp->conn_lock); 3677 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3678 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3679 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3680 ixa->ixa_cpid = connp->conn_cpid; 3681 ixa_refrele(ixa); 3682 return (EHOSTUNREACH); /* IPsec policy failure */ 3683 } 3684 } 3685 3686 /* 3687 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3688 * safe copy, then we need to fill in any pointers in it. 
3689 */ 3690 if (ixa->ixa_ire == NULL) { 3691 in6_addr_t faddr, saddr; 3692 in6_addr_t nexthop; 3693 in_port_t fport; 3694 3695 saddr = connp->conn_saddr_v6; 3696 faddr = connp->conn_faddr_v6; 3697 fport = connp->conn_fport; 3698 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3699 mutex_exit(&connp->conn_lock); 3700 3701 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3702 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3703 (do_ipsec ? IPDF_IPSEC : 0)); 3704 switch (error) { 3705 case 0: 3706 break; 3707 case EADDRNOTAVAIL: 3708 /* 3709 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3710 * Don't have the application see that errno 3711 */ 3712 error = ENETUNREACH; 3713 goto failed; 3714 case ENETDOWN: 3715 /* 3716 * Have !ipif_addr_ready address; drop packet silently 3717 * until we can get applications to not send until we 3718 * are ready. 3719 */ 3720 error = 0; 3721 goto failed; 3722 case EHOSTUNREACH: 3723 case ENETUNREACH: 3724 if (ixa->ixa_ire != NULL) { 3725 /* 3726 * Let conn_ip_output/ire_send_noroute return 3727 * the error and send any local ICMP error. 3728 */ 3729 error = 0; 3730 break; 3731 } 3732 /* FALLTHRU */ 3733 default: 3734 failed: 3735 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3736 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3737 ixa->ixa_cpid = connp->conn_cpid; 3738 ixa_refrele(ixa); 3739 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3740 freemsg(mp); 3741 return (error); 3742 } 3743 } else { 3744 /* Done with conn_t */ 3745 mutex_exit(&connp->conn_lock); 3746 } 3747 3748 /* We're done. Pass the packet to ip. 
*/ 3749 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3750 3751 error = conn_ip_output(mp, ixa); 3752 /* No rawipOutErrors if an error since IP increases its error counter */ 3753 switch (error) { 3754 case 0: 3755 break; 3756 case EWOULDBLOCK: 3757 (void) ixa_check_drain_insert(connp, ixa); 3758 error = 0; 3759 break; 3760 case EADDRNOTAVAIL: 3761 /* 3762 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3763 * Don't have the application see that errno 3764 */ 3765 error = ENETUNREACH; 3766 break; 3767 } 3768 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3769 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3770 ixa->ixa_cpid = connp->conn_cpid; 3771 ixa_refrele(ixa); 3772 return (error); 3773 } 3774 3775 /* 3776 * Handle sending an M_DATA to the last destination. 3777 * Handles both IPv4 and IPv6. 3778 * 3779 * NOTE: The caller must hold conn_lock and we drop it here. 3780 */ 3781 int 3782 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3783 ip_xmit_attr_t *ixa) 3784 { 3785 icmp_t *icmp = connp->conn_icmp; 3786 icmp_stack_t *is = icmp->icmp_is; 3787 int error; 3788 boolean_t do_ipsec; 3789 3790 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3791 ASSERT(ixa != NULL); 3792 3793 ASSERT(cr != NULL); 3794 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3795 ixa->ixa_cred = cr; 3796 ixa->ixa_cpid = pid; 3797 3798 /* Defer IPsec if it might need to look at ICMP type/code */ 3799 switch (ixa->ixa_protocol) { 3800 case IPPROTO_ICMP: 3801 case IPPROTO_ICMPV6: 3802 do_ipsec = B_FALSE; 3803 break; 3804 default: 3805 do_ipsec = B_TRUE; 3806 } 3807 3808 3809 mp = icmp_prepend_header_template(connp, ixa, mp, 3810 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3811 3812 if (mp == NULL) { 3813 ASSERT(error != 0); 3814 mutex_exit(&connp->conn_lock); 3815 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3816 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3817 ixa->ixa_cpid = connp->conn_cpid; 3818 ixa_refrele(ixa); 3819 BUMP_MIB(&is->is_rawip_mib, 
rawipOutErrors); 3820 freemsg(mp); 3821 return (error); 3822 } 3823 3824 if (!do_ipsec) { 3825 /* Policy might differ for different ICMP type/code */ 3826 mp = icmp_output_attach_policy(mp, connp, ixa); 3827 if (mp == NULL) { 3828 mutex_exit(&connp->conn_lock); 3829 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3830 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3831 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3832 ixa->ixa_cpid = connp->conn_cpid; 3833 ixa_refrele(ixa); 3834 return (EHOSTUNREACH); /* IPsec policy failure */ 3835 } 3836 } 3837 3838 /* 3839 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3840 * safe copy, then we need to fill in any pointers in it. 3841 */ 3842 if (ixa->ixa_ire == NULL) { 3843 in6_addr_t lastdst, lastsrc; 3844 in6_addr_t nexthop; 3845 in_port_t lastport; 3846 3847 lastsrc = connp->conn_v6lastsrc; 3848 lastdst = connp->conn_v6lastdst; 3849 lastport = connp->conn_lastdstport; 3850 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3851 mutex_exit(&connp->conn_lock); 3852 3853 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3854 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3855 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3856 switch (error) { 3857 case 0: 3858 break; 3859 case EADDRNOTAVAIL: 3860 /* 3861 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3862 * Don't have the application see that errno 3863 */ 3864 error = ENETUNREACH; 3865 goto failed; 3866 case ENETDOWN: 3867 /* 3868 * Have !ipif_addr_ready address; drop packet silently 3869 * until we can get applications to not send until we 3870 * are ready. 3871 */ 3872 error = 0; 3873 goto failed; 3874 case EHOSTUNREACH: 3875 case ENETUNREACH: 3876 if (ixa->ixa_ire != NULL) { 3877 /* 3878 * Let conn_ip_output/ire_send_noroute return 3879 * the error and send any local ICMP error. 
3880 */ 3881 error = 0; 3882 break; 3883 } 3884 /* FALLTHRU */ 3885 default: 3886 failed: 3887 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3888 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3889 ixa->ixa_cpid = connp->conn_cpid; 3890 ixa_refrele(ixa); 3891 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3892 freemsg(mp); 3893 return (error); 3894 } 3895 } else { 3896 /* Done with conn_t */ 3897 mutex_exit(&connp->conn_lock); 3898 } 3899 3900 /* We're done. Pass the packet to ip. */ 3901 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3902 error = conn_ip_output(mp, ixa); 3903 /* No rawipOutErrors if an error since IP increases its error counter */ 3904 switch (error) { 3905 case 0: 3906 break; 3907 case EWOULDBLOCK: 3908 (void) ixa_check_drain_insert(connp, ixa); 3909 error = 0; 3910 break; 3911 case EADDRNOTAVAIL: 3912 /* 3913 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3914 * Don't have the application see that errno 3915 */ 3916 error = ENETUNREACH; 3917 /* FALLTHRU */ 3918 default: 3919 mutex_enter(&connp->conn_lock); 3920 /* 3921 * Clear the source and v6lastdst so we call ip_attr_connect 3922 * for the next packet and try to pick a better source. 3923 */ 3924 if (connp->conn_mcbc_bind) 3925 connp->conn_saddr_v6 = ipv6_all_zeros; 3926 else 3927 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3928 connp->conn_v6lastdst = ipv6_all_zeros; 3929 mutex_exit(&connp->conn_lock); 3930 break; 3931 } 3932 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3933 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3934 ixa->ixa_cpid = connp->conn_cpid; 3935 ixa_refrele(ixa); 3936 return (error); 3937 } 3938 3939 3940 /* 3941 * Prepend the header template and then fill in the source and 3942 * flowinfo. The caller needs to handle the destination address since 3943 * it's setting is different if rthdr or source route. 3944 * 3945 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3946 * When it returns NULL it sets errorp. 
3947 */ 3948 static mblk_t * 3949 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, 3950 const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) 3951 { 3952 icmp_t *icmp = connp->conn_icmp; 3953 icmp_stack_t *is = icmp->icmp_is; 3954 uint_t pktlen; 3955 uint_t copylen; 3956 uint8_t *iph; 3957 uint_t ip_hdr_length; 3958 uint32_t cksum; 3959 ip_pkt_t *ipp; 3960 3961 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3962 3963 /* 3964 * Copy the header template. 3965 */ 3966 copylen = connp->conn_ht_iphc_len; 3967 pktlen = copylen + msgdsize(mp); 3968 if (pktlen > IP_MAXPACKET) { 3969 freemsg(mp); 3970 *errorp = EMSGSIZE; 3971 return (NULL); 3972 } 3973 ixa->ixa_pktlen = pktlen; 3974 3975 /* check/fix buffer config, setup pointers into it */ 3976 iph = mp->b_rptr - copylen; 3977 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { 3978 mblk_t *mp1; 3979 3980 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); 3981 if (mp1 == NULL) { 3982 freemsg(mp); 3983 *errorp = ENOMEM; 3984 return (NULL); 3985 } 3986 mp1->b_wptr = DB_LIM(mp1); 3987 mp1->b_cont = mp; 3988 mp = mp1; 3989 iph = (mp->b_wptr - copylen); 3990 } 3991 mp->b_rptr = iph; 3992 bcopy(connp->conn_ht_iphc, iph, copylen); 3993 ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); 3994 3995 ixa->ixa_ip_hdr_length = ip_hdr_length; 3996 3997 /* 3998 * Prepare for ICMPv6 checksum done in IP. 3999 * 4000 * icmp_build_hdr_template has already massaged any routing header 4001 * and placed the result in conn_sum. 4002 * 4003 * We make it easy for IP to include our pseudo header 4004 * by putting our length (and any routing header adjustment) 4005 * in the ICMPv6 checksum field. 
4006 */ 4007 cksum = pktlen - ip_hdr_length; 4008 4009 cksum += connp->conn_sum; 4010 cksum = (cksum >> 16) + (cksum & 0xFFFF); 4011 ASSERT(cksum < 0x10000); 4012 4013 ipp = &connp->conn_xmit_ipp; 4014 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4015 ipha_t *ipha = (ipha_t *)iph; 4016 4017 ipha->ipha_length = htons((uint16_t)pktlen); 4018 4019 /* if IP_PKTINFO specified an addres it wins over bind() */ 4020 if ((ipp->ipp_fields & IPPF_ADDR) && 4021 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4022 ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); 4023 ipha->ipha_src = ipp->ipp_addr_v4; 4024 } else { 4025 IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); 4026 } 4027 } else { 4028 ip6_t *ip6h = (ip6_t *)iph; 4029 uint_t cksum_offset = 0; 4030 4031 ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); 4032 4033 /* if IP_PKTINFO specified an addres it wins over bind() */ 4034 if ((ipp->ipp_fields & IPPF_ADDR) && 4035 !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4036 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); 4037 ip6h->ip6_src = ipp->ipp_addr; 4038 } else { 4039 ip6h->ip6_src = *v6src; 4040 } 4041 ip6h->ip6_vcf = 4042 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 4043 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 4044 if (ipp->ipp_fields & IPPF_TCLASS) { 4045 /* Overrides the class part of flowinfo */ 4046 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, 4047 ipp->ipp_tclass); 4048 } 4049 4050 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 4051 if (connp->conn_proto == IPPROTO_ICMPV6) { 4052 cksum_offset = ixa->ixa_ip_hdr_length + 4053 offsetof(icmp6_t, icmp6_cksum); 4054 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 4055 cksum_offset = ixa->ixa_ip_hdr_length + 4056 ixa->ixa_raw_cksum_offset; 4057 } 4058 } 4059 if (cksum_offset != 0) { 4060 uint16_t *ptr; 4061 4062 /* Make sure the checksum fits in the first mblk */ 4063 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 4064 mblk_t *mp1; 4065 4066 mp1 = msgpullup(mp, 4067 cksum_offset + sizeof (short)); 4068 freemsg(mp); 4069 if (mp1 
== NULL) { 4070 *errorp = ENOMEM; 4071 return (NULL); 4072 } 4073 mp = mp1; 4074 iph = mp->b_rptr; 4075 ip6h = (ip6_t *)iph; 4076 } 4077 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 4078 *ptr = htons(cksum); 4079 } 4080 } 4081 4082 return (mp); 4083 } 4084 4085 /* 4086 * This routine handles all messages passed downstream. It either 4087 * consumes the message or passes it downstream; it never queues a 4088 * a message. 4089 */ 4090 void 4091 icmp_wput(queue_t *q, mblk_t *mp) 4092 { 4093 sin6_t *sin6; 4094 sin_t *sin = NULL; 4095 uint_t srcid; 4096 conn_t *connp = Q_TO_CONN(q); 4097 icmp_t *icmp = connp->conn_icmp; 4098 int error = 0; 4099 struct sockaddr *addr = NULL; 4100 socklen_t addrlen; 4101 icmp_stack_t *is = icmp->icmp_is; 4102 struct T_unitdata_req *tudr; 4103 mblk_t *data_mp; 4104 cred_t *cr; 4105 pid_t pid; 4106 4107 /* 4108 * We directly handle several cases here: T_UNITDATA_REQ message 4109 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected 4110 * socket. 4111 */ 4112 switch (DB_TYPE(mp)) { 4113 case M_DATA: 4114 /* sockfs never sends down M_DATA */ 4115 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4116 freemsg(mp); 4117 return; 4118 4119 case M_PROTO: 4120 case M_PCPROTO: 4121 tudr = (struct T_unitdata_req *)mp->b_rptr; 4122 if (MBLKL(mp) < sizeof (*tudr) || 4123 ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { 4124 icmp_wput_other(q, mp); 4125 return; 4126 } 4127 break; 4128 4129 default: 4130 icmp_wput_other(q, mp); 4131 return; 4132 } 4133 4134 /* Handle valid T_UNITDATA_REQ here */ 4135 data_mp = mp->b_cont; 4136 if (data_mp == NULL) { 4137 error = EPROTO; 4138 goto ud_error2; 4139 } 4140 mp->b_cont = NULL; 4141 4142 if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { 4143 error = EADDRNOTAVAIL; 4144 goto ud_error2; 4145 } 4146 4147 /* 4148 * All Solaris components should pass a db_credp 4149 * for this message, hence we ASSERT. 
4150 * On production kernels we return an error to be robust against 4151 * random streams modules sitting on top of us. 4152 */ 4153 cr = msg_getcred(mp, &pid); 4154 ASSERT(cr != NULL); 4155 if (cr == NULL) { 4156 error = EINVAL; 4157 goto ud_error2; 4158 } 4159 4160 /* 4161 * If a port has not been bound to the stream, fail. 4162 * This is not a problem when sockfs is directly 4163 * above us, because it will ensure that the socket 4164 * is first bound before allowing data to be sent. 4165 */ 4166 if (icmp->icmp_state == TS_UNBND) { 4167 error = EPROTO; 4168 goto ud_error2; 4169 } 4170 addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; 4171 addrlen = tudr->DEST_length; 4172 4173 switch (connp->conn_family) { 4174 case AF_INET6: 4175 sin6 = (sin6_t *)addr; 4176 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || 4177 (sin6->sin6_family != AF_INET6)) { 4178 error = EADDRNOTAVAIL; 4179 goto ud_error2; 4180 } 4181 4182 /* No support for mapped addresses on raw sockets */ 4183 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4184 error = EADDRNOTAVAIL; 4185 goto ud_error2; 4186 } 4187 srcid = sin6->__sin6_src_id; 4188 4189 /* 4190 * If the local address is a mapped address return 4191 * an error. 4192 * It would be possible to send an IPv6 packet but the 4193 * response would never make it back to the application 4194 * since it is bound to a mapped address. 4195 */ 4196 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 4197 error = EADDRNOTAVAIL; 4198 goto ud_error2; 4199 } 4200 4201 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 4202 sin6->sin6_addr = ipv6_loopback; 4203 4204 if (tudr->OPT_length != 0) { 4205 /* 4206 * If we are connected then the destination needs to be 4207 * the same as the connected one. 
4208 */ 4209 if (icmp->icmp_state == TS_DATA_XFER && 4210 !conn_same_as_last_v6(connp, sin6)) { 4211 error = EISCONN; 4212 goto ud_error2; 4213 } 4214 error = icmp_output_ancillary(connp, NULL, sin6, 4215 data_mp, mp, NULL, cr, pid); 4216 } else { 4217 ip_xmit_attr_t *ixa; 4218 4219 /* 4220 * We have to allocate an ip_xmit_attr_t before we grab 4221 * conn_lock and we need to hold conn_lock once we've 4222 * checked conn_same_as_last_v6 to handle concurrent 4223 * send* calls on a socket. 4224 */ 4225 ixa = conn_get_ixa(connp, B_FALSE); 4226 if (ixa == NULL) { 4227 error = ENOMEM; 4228 goto ud_error2; 4229 } 4230 mutex_enter(&connp->conn_lock); 4231 4232 if (conn_same_as_last_v6(connp, sin6) && 4233 connp->conn_lastsrcid == srcid && 4234 ipsec_outbound_policy_current(ixa)) { 4235 /* icmp_output_lastdst drops conn_lock */ 4236 error = icmp_output_lastdst(connp, data_mp, cr, 4237 pid, ixa); 4238 } else { 4239 /* icmp_output_newdst drops conn_lock */ 4240 error = icmp_output_newdst(connp, data_mp, NULL, 4241 sin6, cr, pid, ixa); 4242 } 4243 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4244 } 4245 if (error == 0) { 4246 freeb(mp); 4247 return; 4248 } 4249 break; 4250 4251 case AF_INET: 4252 sin = (sin_t *)addr; 4253 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || 4254 (sin->sin_family != AF_INET)) { 4255 error = EADDRNOTAVAIL; 4256 goto ud_error2; 4257 } 4258 if (sin->sin_addr.s_addr == INADDR_ANY) 4259 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 4260 4261 /* Protocol 255 contains full IP headers */ 4262 /* Read without holding lock */ 4263 if (icmp->icmp_hdrincl) { 4264 if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { 4265 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { 4266 error = EINVAL; 4267 goto ud_error2; 4268 } 4269 } 4270 error = icmp_output_hdrincl(connp, data_mp, cr, pid); 4271 if (error == 0) { 4272 freeb(mp); 4273 return; 4274 } 4275 /* data_mp consumed above */ 4276 data_mp = NULL; 4277 goto ud_error2; 4278 } 4279 4280 if (tudr->OPT_length != 
0) { 4281 /* 4282 * If we are connected then the destination needs to be 4283 * the same as the connected one. 4284 */ 4285 if (icmp->icmp_state == TS_DATA_XFER && 4286 !conn_same_as_last_v4(connp, sin)) { 4287 error = EISCONN; 4288 goto ud_error2; 4289 } 4290 error = icmp_output_ancillary(connp, sin, NULL, 4291 data_mp, mp, NULL, cr, pid); 4292 } else { 4293 ip_xmit_attr_t *ixa; 4294 4295 /* 4296 * We have to allocate an ip_xmit_attr_t before we grab 4297 * conn_lock and we need to hold conn_lock once we've 4298 * checked conn_same_as_last_v4 to handle concurrent 4299 * send* calls on a socket. 4300 */ 4301 ixa = conn_get_ixa(connp, B_FALSE); 4302 if (ixa == NULL) { 4303 error = ENOMEM; 4304 goto ud_error2; 4305 } 4306 mutex_enter(&connp->conn_lock); 4307 4308 if (conn_same_as_last_v4(connp, sin) && 4309 ipsec_outbound_policy_current(ixa)) { 4310 /* icmp_output_lastdst drops conn_lock */ 4311 error = icmp_output_lastdst(connp, data_mp, cr, 4312 pid, ixa); 4313 } else { 4314 /* icmp_output_newdst drops conn_lock */ 4315 error = icmp_output_newdst(connp, data_mp, sin, 4316 NULL, cr, pid, ixa); 4317 } 4318 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4319 } 4320 if (error == 0) { 4321 freeb(mp); 4322 return; 4323 } 4324 break; 4325 } 4326 ASSERT(mp != NULL); 4327 /* mp is freed by the following routine */ 4328 icmp_ud_err(q, mp, (t_scalar_t)error); 4329 return; 4330 4331 ud_error2: 4332 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4333 freemsg(data_mp); 4334 ASSERT(mp != NULL); 4335 /* mp is freed by the following routine */ 4336 icmp_ud_err(q, mp, (t_scalar_t)error); 4337 } 4338 4339 /* 4340 * Handle the case of the IP address or flow label being different 4341 * for both IPv4 and IPv6. 4342 * 4343 * NOTE: The caller must hold conn_lock and we drop it here. 
4344 */ 4345 static int 4346 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4347 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4348 { 4349 icmp_t *icmp = connp->conn_icmp; 4350 icmp_stack_t *is = icmp->icmp_is; 4351 int error; 4352 ip_xmit_attr_t *oldixa; 4353 boolean_t do_ipsec; 4354 uint_t srcid; 4355 uint32_t flowinfo; 4356 in6_addr_t v6src; 4357 in6_addr_t v6dst; 4358 in6_addr_t v6nexthop; 4359 in_port_t dstport; 4360 4361 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4362 ASSERT(ixa != NULL); 4363 4364 /* 4365 * We hold conn_lock across all the use and modifications of 4366 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4367 * stay consistent. 4368 */ 4369 4370 ASSERT(cr != NULL); 4371 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4372 ixa->ixa_cred = cr; 4373 ixa->ixa_cpid = pid; 4374 if (is_system_labeled()) { 4375 /* We need to restart with a label based on the cred */ 4376 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4377 } 4378 /* 4379 * If we are connected then the destination needs to be the 4380 * same as the connected one, which is not the case here since we 4381 * checked for that above. 4382 */ 4383 if (icmp->icmp_state == TS_DATA_XFER) { 4384 mutex_exit(&connp->conn_lock); 4385 error = EISCONN; 4386 goto ud_error; 4387 } 4388 4389 /* In case previous destination was multicast or multirt */ 4390 ip_attr_newdst(ixa); 4391 4392 /* 4393 * If laddr is unspecified then we look at sin6_src_id. 4394 * We will give precedence to a source address set with IPV6_PKTINFO 4395 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4396 * want ip_attr_connect to select a source (since it can fail) when 4397 * IPV6_PKTINFO is specified. 4398 * If this doesn't result in a source address then we get a source 4399 * from ip_attr_connect() below. 
4400 */ 4401 v6src = connp->conn_saddr_v6; 4402 if (sin != NULL) { 4403 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4404 dstport = sin->sin_port; 4405 flowinfo = 0; 4406 srcid = 0; 4407 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4408 if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { 4409 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4410 connp->conn_netstack); 4411 } 4412 ixa->ixa_flags |= IXAF_IS_IPV4; 4413 } else { 4414 v6dst = sin6->sin6_addr; 4415 dstport = sin6->sin6_port; 4416 flowinfo = sin6->sin6_flowinfo; 4417 srcid = sin6->__sin6_src_id; 4418 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4419 ixa->ixa_scopeid = sin6->sin6_scope_id; 4420 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4421 } else { 4422 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4423 } 4424 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4425 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4426 connp->conn_netstack); 4427 } 4428 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 4429 ixa->ixa_flags |= IXAF_IS_IPV4; 4430 else 4431 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4432 } 4433 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */ 4434 if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) { 4435 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4436 4437 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4438 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4439 v6src = ipp->ipp_addr; 4440 } else { 4441 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4442 v6src = ipp->ipp_addr; 4443 } 4444 } 4445 4446 /* Defer IPsec if it might need to look at ICMP type/code */ 4447 switch (ixa->ixa_protocol) { 4448 case IPPROTO_ICMP: 4449 case IPPROTO_ICMPV6: 4450 do_ipsec = B_FALSE; 4451 break; 4452 default: 4453 do_ipsec = B_TRUE; 4454 } 4455 4456 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4457 mutex_exit(&connp->conn_lock); 4458 4459 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4460 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4461 (do_ipsec ? 
IPDF_IPSEC : 0)); 4462 switch (error) { 4463 case 0: 4464 break; 4465 case EADDRNOTAVAIL: 4466 /* 4467 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4468 * Don't have the application see that errno 4469 */ 4470 error = ENETUNREACH; 4471 goto failed; 4472 case ENETDOWN: 4473 /* 4474 * Have !ipif_addr_ready address; drop packet silently 4475 * until we can get applications to not send until we 4476 * are ready. 4477 */ 4478 error = 0; 4479 goto failed; 4480 case EHOSTUNREACH: 4481 case ENETUNREACH: 4482 if (ixa->ixa_ire != NULL) { 4483 /* 4484 * Let conn_ip_output/ire_send_noroute return 4485 * the error and send any local ICMP error. 4486 */ 4487 error = 0; 4488 break; 4489 } 4490 /* FALLTHRU */ 4491 default: 4492 failed: 4493 goto ud_error; 4494 } 4495 4496 mutex_enter(&connp->conn_lock); 4497 /* 4498 * While we dropped the lock some other thread might have connected 4499 * this socket. If so we bail out with EISCONN to ensure that the 4500 * connecting thread is the one that updates conn_ixa, conn_ht_* 4501 * and conn_*last*. 4502 */ 4503 if (icmp->icmp_state == TS_DATA_XFER) { 4504 mutex_exit(&connp->conn_lock); 4505 error = EISCONN; 4506 goto ud_error; 4507 } 4508 4509 /* 4510 * We need to rebuild the headers if 4511 * - we are labeling packets (could be different for different 4512 * destinations) 4513 * - we have a source route (or routing header) since we need to 4514 * massage that to get the pseudo-header checksum 4515 * - a socket option with COA_HEADER_CHANGED has been set which 4516 * set conn_v6lastdst to zero. 4517 * 4518 * Otherwise the prepend function will just update the src, dst, 4519 * and flow label. 
4520 */ 4521 if (is_system_labeled()) { 4522 /* TX MLP requires SCM_UCRED and don't have that here */ 4523 if (connp->conn_mlp_type != mlptSingle) { 4524 mutex_exit(&connp->conn_lock); 4525 error = ECONNREFUSED; 4526 goto ud_error; 4527 } 4528 /* 4529 * Check whether Trusted Solaris policy allows communication 4530 * with this host, and pretend that the destination is 4531 * unreachable if not. 4532 * Compute any needed label and place it in ipp_label_v4/v6. 4533 * 4534 * Later conn_build_hdr_template/conn_prepend_hdr takes 4535 * ipp_label_v4/v6 to form the packet. 4536 * 4537 * Tsol note: Since we hold conn_lock we know no other 4538 * thread manipulates conn_xmit_ipp. 4539 */ 4540 error = conn_update_label(connp, ixa, &v6dst, 4541 &connp->conn_xmit_ipp); 4542 if (error != 0) { 4543 mutex_exit(&connp->conn_lock); 4544 goto ud_error; 4545 } 4546 /* Rebuild the header template */ 4547 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4548 flowinfo); 4549 if (error != 0) { 4550 mutex_exit(&connp->conn_lock); 4551 goto ud_error; 4552 } 4553 } else if (connp->conn_xmit_ipp.ipp_fields & 4554 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4555 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4556 /* Rebuild the header template */ 4557 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4558 flowinfo); 4559 if (error != 0) { 4560 mutex_exit(&connp->conn_lock); 4561 goto ud_error; 4562 } 4563 } else { 4564 /* Simply update the destination address if no source route */ 4565 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4566 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4567 4568 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4569 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4570 ipha->ipha_fragment_offset_and_flags |= 4571 IPH_DF_HTONS; 4572 } else { 4573 ipha->ipha_fragment_offset_and_flags &= 4574 ~IPH_DF_HTONS; 4575 } 4576 } else { 4577 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4578 ip6h->ip6_dst = v6dst; 4579 } 4580 } 4581 4582 /* 4583 * Remember the dst etc which corresponds to 
the built header 4584 * template and conn_ixa. 4585 */ 4586 oldixa = conn_replace_ixa(connp, ixa); 4587 connp->conn_v6lastdst = v6dst; 4588 connp->conn_lastflowinfo = flowinfo; 4589 connp->conn_lastscopeid = ixa->ixa_scopeid; 4590 connp->conn_lastsrcid = srcid; 4591 /* Also remember a source to use together with lastdst */ 4592 connp->conn_v6lastsrc = v6src; 4593 4594 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4595 flowinfo, &error); 4596 4597 /* Done with conn_t */ 4598 mutex_exit(&connp->conn_lock); 4599 ixa_refrele(oldixa); 4600 4601 if (data_mp == NULL) { 4602 ASSERT(error != 0); 4603 goto ud_error; 4604 } 4605 4606 if (!do_ipsec) { 4607 /* Policy might differ for different ICMP type/code */ 4608 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4609 if (data_mp == NULL) { 4610 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4611 error = EHOSTUNREACH; /* IPsec policy failure */ 4612 goto done; 4613 } 4614 } 4615 4616 /* We're done. Pass the packet to ip. */ 4617 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4618 4619 error = conn_ip_output(data_mp, ixa); 4620 /* No rawipOutErrors if an error since IP increases its error counter */ 4621 switch (error) { 4622 case 0: 4623 break; 4624 case EWOULDBLOCK: 4625 (void) ixa_check_drain_insert(connp, ixa); 4626 error = 0; 4627 break; 4628 case EADDRNOTAVAIL: 4629 /* 4630 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4631 * Don't have the application see that errno 4632 */ 4633 error = ENETUNREACH; 4634 /* FALLTHRU */ 4635 default: 4636 mutex_enter(&connp->conn_lock); 4637 /* 4638 * Clear the source and v6lastdst so we call ip_attr_connect 4639 * for the next packet and try to pick a better source. 
4640 */ 4641 if (connp->conn_mcbc_bind) 4642 connp->conn_saddr_v6 = ipv6_all_zeros; 4643 else 4644 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4645 connp->conn_v6lastdst = ipv6_all_zeros; 4646 mutex_exit(&connp->conn_lock); 4647 break; 4648 } 4649 done: 4650 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4651 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4652 ixa->ixa_cpid = connp->conn_cpid; 4653 ixa_refrele(ixa); 4654 return (error); 4655 4656 ud_error: 4657 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4658 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4659 ixa->ixa_cpid = connp->conn_cpid; 4660 ixa_refrele(ixa); 4661 4662 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4663 freemsg(data_mp); 4664 return (error); 4665 } 4666 4667 /* ARGSUSED */ 4668 static void 4669 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4670 { 4671 #ifdef DEBUG 4672 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4673 #endif 4674 freemsg(mp); 4675 } 4676 4677 static void 4678 icmp_wput_other(queue_t *q, mblk_t *mp) 4679 { 4680 uchar_t *rptr = mp->b_rptr; 4681 struct iocblk *iocp; 4682 conn_t *connp = Q_TO_CONN(q); 4683 icmp_t *icmp = connp->conn_icmp; 4684 cred_t *cr; 4685 4686 switch (mp->b_datap->db_type) { 4687 case M_PROTO: 4688 case M_PCPROTO: 4689 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4690 /* 4691 * If the message does not contain a PRIM_type, 4692 * throw it away. 4693 */ 4694 freemsg(mp); 4695 return; 4696 } 4697 switch (((t_primp_t)rptr)->type) { 4698 case T_ADDR_REQ: 4699 icmp_addr_req(q, mp); 4700 return; 4701 case O_T_BIND_REQ: 4702 case T_BIND_REQ: 4703 icmp_tpi_bind(q, mp); 4704 return; 4705 case T_CONN_REQ: 4706 icmp_tpi_connect(q, mp); 4707 return; 4708 case T_CAPABILITY_REQ: 4709 icmp_capability_req(q, mp); 4710 return; 4711 case T_INFO_REQ: 4712 icmp_info_req(q, mp); 4713 return; 4714 case T_UNITDATA_REQ: 4715 /* 4716 * If a T_UNITDATA_REQ gets here, the address must 4717 * be bad. 
Valid T_UNITDATA_REQs are handled 4718 * in icmp_wput. 4719 */ 4720 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4721 return; 4722 case T_UNBIND_REQ: 4723 icmp_tpi_unbind(q, mp); 4724 return; 4725 case T_SVR4_OPTMGMT_REQ: 4726 /* 4727 * All Solaris components should pass a db_credp 4728 * for this TPI message, hence we ASSERT. 4729 * But in case there is some other M_PROTO that looks 4730 * like a TPI message sent by some other kernel 4731 * component, we check and return an error. 4732 */ 4733 cr = msg_getcred(mp, NULL); 4734 ASSERT(cr != NULL); 4735 if (cr == NULL) { 4736 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4737 return; 4738 } 4739 4740 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4741 cr)) { 4742 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4743 } 4744 return; 4745 4746 case T_OPTMGMT_REQ: 4747 /* 4748 * All Solaris components should pass a db_credp 4749 * for this TPI message, hence we ASSERT. 4750 * But in case there is some other M_PROTO that looks 4751 * like a TPI message sent by some other kernel 4752 * component, we check and return an error. 4753 */ 4754 cr = msg_getcred(mp, NULL); 4755 ASSERT(cr != NULL); 4756 if (cr == NULL) { 4757 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4758 return; 4759 } 4760 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4761 return; 4762 4763 case T_DISCON_REQ: 4764 icmp_tpi_disconnect(q, mp); 4765 return; 4766 4767 /* The following TPI message is not supported by icmp. */ 4768 case O_T_CONN_RES: 4769 case T_CONN_RES: 4770 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4771 return; 4772 4773 /* The following 3 TPI requests are illegal for icmp. 
*/ 4774 case T_DATA_REQ: 4775 case T_EXDATA_REQ: 4776 case T_ORDREL_REQ: 4777 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4778 return; 4779 default: 4780 break; 4781 } 4782 break; 4783 case M_FLUSH: 4784 if (*rptr & FLUSHW) 4785 flushq(q, FLUSHDATA); 4786 break; 4787 case M_IOCTL: 4788 iocp = (struct iocblk *)mp->b_rptr; 4789 switch (iocp->ioc_cmd) { 4790 case TI_GETPEERNAME: 4791 if (icmp->icmp_state != TS_DATA_XFER) { 4792 /* 4793 * If a default destination address has not 4794 * been associated with the stream, then we 4795 * don't know the peer's name. 4796 */ 4797 iocp->ioc_error = ENOTCONN; 4798 iocp->ioc_count = 0; 4799 mp->b_datap->db_type = M_IOCACK; 4800 qreply(q, mp); 4801 return; 4802 } 4803 /* FALLTHRU */ 4804 case TI_GETMYNAME: 4805 /* 4806 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4807 * need to copyin the user's strbuf structure. 4808 * Processing will continue in the M_IOCDATA case 4809 * below. 4810 */ 4811 mi_copyin(q, mp, NULL, 4812 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4813 return; 4814 default: 4815 break; 4816 } 4817 break; 4818 case M_IOCDATA: 4819 icmp_wput_iocdata(q, mp); 4820 return; 4821 default: 4822 /* Unrecognized messages are passed through without change. */ 4823 break; 4824 } 4825 ip_wput_nondata(q, mp); 4826 } 4827 4828 /* 4829 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4830 * messages. 4831 */ 4832 static void 4833 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4834 { 4835 mblk_t *mp1; 4836 STRUCT_HANDLE(strbuf, sb); 4837 uint_t addrlen; 4838 conn_t *connp = Q_TO_CONN(q); 4839 icmp_t *icmp = connp->conn_icmp; 4840 4841 /* Make sure it is one of ours. 
*/ 4842 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4843 case TI_GETMYNAME: 4844 case TI_GETPEERNAME: 4845 break; 4846 default: 4847 ip_wput_nondata(q, mp); 4848 return; 4849 } 4850 4851 switch (mi_copy_state(q, mp, &mp1)) { 4852 case -1: 4853 return; 4854 case MI_COPY_CASE(MI_COPY_IN, 1): 4855 break; 4856 case MI_COPY_CASE(MI_COPY_OUT, 1): 4857 /* 4858 * The address has been copied out, so now 4859 * copyout the strbuf. 4860 */ 4861 mi_copyout(q, mp); 4862 return; 4863 case MI_COPY_CASE(MI_COPY_OUT, 2): 4864 /* 4865 * The address and strbuf have been copied out. 4866 * We're done, so just acknowledge the original 4867 * M_IOCTL. 4868 */ 4869 mi_copy_done(q, mp, 0); 4870 return; 4871 default: 4872 /* 4873 * Something strange has happened, so acknowledge 4874 * the original M_IOCTL with an EPROTO error. 4875 */ 4876 mi_copy_done(q, mp, EPROTO); 4877 return; 4878 } 4879 4880 /* 4881 * Now we have the strbuf structure for TI_GETMYNAME 4882 * and TI_GETPEERNAME. Next we copyout the requested 4883 * address and then we'll copyout the strbuf. 
4884 */ 4885 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4886 (void *)mp1->b_rptr); 4887 4888 if (connp->conn_family == AF_INET) 4889 addrlen = sizeof (sin_t); 4890 else 4891 addrlen = sizeof (sin6_t); 4892 4893 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4894 mi_copy_done(q, mp, EINVAL); 4895 return; 4896 } 4897 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4898 case TI_GETMYNAME: 4899 break; 4900 case TI_GETPEERNAME: 4901 if (icmp->icmp_state != TS_DATA_XFER) { 4902 mi_copy_done(q, mp, ENOTCONN); 4903 return; 4904 } 4905 break; 4906 default: 4907 mi_copy_done(q, mp, EPROTO); 4908 return; 4909 } 4910 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4911 if (!mp1) 4912 return; 4913 4914 STRUCT_FSET(sb, len, addrlen); 4915 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4916 case TI_GETMYNAME: 4917 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4918 &addrlen); 4919 break; 4920 case TI_GETPEERNAME: 4921 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4922 &addrlen); 4923 break; 4924 } 4925 mp1->b_wptr += addrlen; 4926 /* Copy out the address */ 4927 mi_copyout(q, mp); 4928 } 4929 4930 void 4931 icmp_ddi_g_init(void) 4932 { 4933 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4934 icmp_opt_obj.odb_opt_arr_cnt); 4935 4936 /* 4937 * We want to be informed each time a stack is created or 4938 * destroyed in the kernel, so we can maintain the 4939 * set of icmp_stack_t's. 4940 */ 4941 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4942 } 4943 4944 void 4945 icmp_ddi_g_destroy(void) 4946 { 4947 netstack_unregister(NS_ICMP); 4948 } 4949 4950 #define INET_NAME "ip" 4951 4952 /* 4953 * Initialize the ICMP stack instance. 
4954 */ 4955 static void * 4956 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 4957 { 4958 icmp_stack_t *is; 4959 int error = 0; 4960 size_t arrsz; 4961 major_t major; 4962 4963 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 4964 is->is_netstack = ns; 4965 4966 arrsz = sizeof (icmp_propinfo_tbl); 4967 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 4968 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 4969 4970 is->is_ksp = rawip_kstat_init(stackid); 4971 4972 major = mod_name_to_major(INET_NAME); 4973 error = ldi_ident_from_major(major, &is->is_ldi_ident); 4974 ASSERT(error == 0); 4975 return (is); 4976 } 4977 4978 /* 4979 * Free the ICMP stack instance. 4980 */ 4981 static void 4982 rawip_stack_fini(netstackid_t stackid, void *arg) 4983 { 4984 icmp_stack_t *is = (icmp_stack_t *)arg; 4985 4986 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 4987 is->is_propinfo_tbl = NULL; 4988 4989 rawip_kstat_fini(stackid, is->is_ksp); 4990 is->is_ksp = NULL; 4991 ldi_ident_release(is->is_ldi_ident); 4992 kmem_free(is, sizeof (*is)); 4993 } 4994 4995 static void * 4996 rawip_kstat_init(netstackid_t stackid) { 4997 kstat_t *ksp; 4998 4999 rawip_named_kstat_t template = { 5000 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5001 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5002 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5003 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5004 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5005 }; 5006 5007 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5008 KSTAT_TYPE_NAMED, 5009 NUM_OF_FIELDS(rawip_named_kstat_t), 5010 0, stackid); 5011 if (ksp == NULL || ksp->ks_data == NULL) 5012 return (NULL); 5013 5014 bcopy(&template, ksp->ks_data, sizeof (template)); 5015 ksp->ks_update = rawip_kstat_update; 5016 ksp->ks_private = (void *)(uintptr_t)stackid; 5017 5018 kstat_install(ksp); 5019 return (ksp); 5020 } 5021 5022 static void 5023 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5024 { 5025 if (ksp != NULL) 
{ 5026 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5027 kstat_delete_netstack(ksp, stackid); 5028 } 5029 } 5030 5031 static int 5032 rawip_kstat_update(kstat_t *ksp, int rw) 5033 { 5034 rawip_named_kstat_t *rawipkp; 5035 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5036 netstack_t *ns; 5037 icmp_stack_t *is; 5038 5039 if ((ksp == NULL) || (ksp->ks_data == NULL)) 5040 return (EIO); 5041 5042 if (rw == KSTAT_WRITE) 5043 return (EACCES); 5044 5045 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5046 5047 ns = netstack_find_by_stackid(stackid); 5048 if (ns == NULL) 5049 return (-1); 5050 is = ns->netstack_icmp; 5051 if (is == NULL) { 5052 netstack_rele(ns); 5053 return (-1); 5054 } 5055 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5056 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5057 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5058 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5059 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5060 netstack_rele(ns); 5061 return (0); 5062 } 5063 5064 /* ARGSUSED */ 5065 int 5066 rawip_accept(sock_lower_handle_t lproto_handle, 5067 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5068 cred_t *cr) 5069 { 5070 return (EOPNOTSUPP); 5071 } 5072 5073 /* ARGSUSED */ 5074 int 5075 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5076 socklen_t len, cred_t *cr) 5077 { 5078 conn_t *connp = (conn_t *)proto_handle; 5079 int error; 5080 5081 /* All Solaris components should pass a cred for this operation. 
*/ 5082 ASSERT(cr != NULL); 5083 5084 /* Binding to a NULL address really means unbind */ 5085 if (sa == NULL) 5086 error = rawip_do_unbind(connp); 5087 else 5088 error = rawip_do_bind(connp, sa, len); 5089 5090 if (error < 0) { 5091 if (error == -TOUTSTATE) 5092 error = EINVAL; 5093 else 5094 error = proto_tlitosyserr(-error); 5095 } 5096 return (error); 5097 } 5098 5099 static int 5100 rawip_implicit_bind(conn_t *connp) 5101 { 5102 sin6_t sin6addr; 5103 sin_t *sin; 5104 sin6_t *sin6; 5105 socklen_t len; 5106 int error; 5107 5108 if (connp->conn_family == AF_INET) { 5109 len = sizeof (struct sockaddr_in); 5110 sin = (sin_t *)&sin6addr; 5111 *sin = sin_null; 5112 sin->sin_family = AF_INET; 5113 sin->sin_addr.s_addr = INADDR_ANY; 5114 } else { 5115 ASSERT(connp->conn_family == AF_INET6); 5116 len = sizeof (sin6_t); 5117 sin6 = (sin6_t *)&sin6addr; 5118 *sin6 = sin6_null; 5119 sin6->sin6_family = AF_INET6; 5120 V6_SET_ZERO(sin6->sin6_addr); 5121 } 5122 5123 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5124 5125 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5126 } 5127 5128 static int 5129 rawip_unbind(conn_t *connp) 5130 { 5131 int error; 5132 5133 error = rawip_do_unbind(connp); 5134 if (error < 0) { 5135 error = proto_tlitosyserr(-error); 5136 } 5137 return (error); 5138 } 5139 5140 /* ARGSUSED */ 5141 int 5142 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5143 { 5144 return (EOPNOTSUPP); 5145 } 5146 5147 int 5148 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5149 socklen_t len, sock_connid_t *id, cred_t *cr) 5150 { 5151 conn_t *connp = (conn_t *)proto_handle; 5152 icmp_t *icmp = connp->conn_icmp; 5153 int error; 5154 boolean_t did_bind = B_FALSE; 5155 pid_t pid = curproc->p_pid; 5156 5157 /* All Solaris components should pass a cred for this operation. 
/*
 * Convert a non-STREAMS raw IP socket into a STREAMS one using queue
 * pair `q'.
 *
 * The conn is attached to the queue pair, the stream head is told about
 * write offset and receive high-water mark, the helper stream is freed,
 * and the state the caller needs (capability ack, local and peer address,
 * socket option flags) is collected and passed to quiesced_cb().  Any
 * message returned by the callback, followed by whatever was queued on
 * icmp_fallback_queue_head, is then sent up the read queue, and
 * IPCL_NONSTR is cleared.
 *
 * Always returns 0.
 */
/* ARGSUSED2 */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
    sock_quiesce_arg_t *arg)
{
	conn_t  *connp = (conn_t *)proto_handle;
	icmp_t	*icmp;
	struct T_capability_ack tca;
	struct sockaddr_in6 laddr, faddr;
	socklen_t laddrlen, faddrlen;
	short opts;
	struct stroptions *stropt;
	mblk_t *mp, *stropt_mp;
	int error;

	icmp = connp->conn_icmp;

	/* Allocate the M_SETOPTS block up front, before touching any state */
	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);

	/*
	 * setup the fallback stream that was allocated
	 */
	/* Save what the queues' q_ptr fields carried before overwriting them */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	WR(q)->q_qinfo = &icmpwinit;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	/* Notify stream head about options before sending up data */
	stropt_mp->b_datap->db_type = M_SETOPTS;
	stropt_mp->b_wptr += sizeof (*stropt);
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt->so_flags = SO_WROFF | SO_HIWAT;
	stropt->so_wroff = connp->conn_wroff;
	stropt->so_hiwat = connp->conn_rcvbuf;
	putnext(RD(q), stropt_mp);

	/*
	 * free helper stream
	 */
	ip_free_helper_stream(connp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	icmp_do_capability_ack(icmp, &tca, TC1_INFO);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) rawip_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = rawip_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	/* No peer address available: report a zero-length one */
	if (error != 0)
		faddrlen = 0;
	opts = 0;
	if (connp->conn_dgram_errind)
		opts |= SO_DGRAM_ERRIND;
	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
		opts |= SO_DONTROUTE;

	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/*
	 * Attempts to send data up during fallback will result in it being
	 * queued in icmp_t. Now we push up any queued packets.
	 */
	mutex_enter(&icmp->icmp_recv_lock);
	/* The callback's message goes first, ahead of anything queued */
	if (mp != NULL) {
		mp->b_next = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp;
	}
	while (icmp->icmp_fallback_queue_head != NULL) {
		mp = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp->b_next;
		mp->b_next = NULL;
		/* icmp_recv_lock is dropped around each putnext() */
		mutex_exit(&icmp->icmp_recv_lock);
		putnext(RD(q), mp);
		mutex_enter(&icmp->icmp_recv_lock);
	}
	/* The head is NULL once the loop exits, so this clears the tail too */
	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;

	/*
	 * No longer a streams less socket
	 */
	/* Cleared with icmp_recv_lock still held, and under conn_lock */
	mutex_enter(&connp->conn_lock);
	connp->conn_flags &= ~IPCL_NONSTR;
	mutex_exit(&connp->conn_lock);

	mutex_exit(&icmp->icmp_recv_lock);

	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
	    icmp->icmp_fallback_queue_tail == NULL);

	ASSERT(connp->conn_ref >= 1);

	return (0);
}
Solaris components should pass a cred for this operation. */ 5355 ASSERT(cr != NULL); 5356 5357 connp->conn_upcalls = sock_upcalls; 5358 connp->conn_upper_handle = sock_handle; 5359 5360 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5361 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5362 sopp.sopp_wroff = connp->conn_wroff; 5363 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5364 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5365 sopp.sopp_maxblk = INFPSZ; 5366 sopp.sopp_maxpsz = IP_MAXPACKET; 5367 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5368 icmp_mod_info.mi_minpsz; 5369 5370 (*connp->conn_upcalls->su_set_proto_props) 5371 (connp->conn_upper_handle, &sopp); 5372 5373 icmp_bind_proto(connp->conn_icmp); 5374 } 5375 5376 /* ARGSUSED3 */ 5377 int 5378 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5379 socklen_t *salenp, cred_t *cr) 5380 { 5381 conn_t *connp = (conn_t *)proto_handle; 5382 icmp_t *icmp = connp->conn_icmp; 5383 int error; 5384 5385 /* All Solaris components should pass a cred for this operation. */ 5386 ASSERT(cr != NULL); 5387 5388 mutex_enter(&connp->conn_lock); 5389 if (icmp->icmp_state != TS_DATA_XFER) 5390 error = ENOTCONN; 5391 else 5392 error = conn_getpeername(connp, sa, salenp); 5393 mutex_exit(&connp->conn_lock); 5394 return (error); 5395 } 5396 5397 /* ARGSUSED3 */ 5398 int 5399 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5400 socklen_t *salenp, cred_t *cr) 5401 { 5402 conn_t *connp = (conn_t *)proto_handle; 5403 int error; 5404 5405 /* All Solaris components should pass a cred for this operation. 
*/ 5406 ASSERT(cr != NULL); 5407 5408 mutex_enter(&connp->conn_lock); 5409 error = conn_getsockname(connp, sa, salenp); 5410 mutex_exit(&connp->conn_lock); 5411 return (error); 5412 } 5413 5414 int 5415 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5416 const void *optvalp, socklen_t optlen, cred_t *cr) 5417 { 5418 conn_t *connp = (conn_t *)proto_handle; 5419 int error; 5420 5421 /* All Solaris components should pass a cred for this operation. */ 5422 ASSERT(cr != NULL); 5423 5424 error = proto_opt_check(level, option_name, optlen, NULL, 5425 icmp_opt_obj.odb_opt_des_arr, 5426 icmp_opt_obj.odb_opt_arr_cnt, 5427 B_TRUE, B_FALSE, cr); 5428 5429 if (error != 0) { 5430 /* 5431 * option not recognized 5432 */ 5433 if (error < 0) { 5434 error = proto_tlitosyserr(-error); 5435 } 5436 return (error); 5437 } 5438 5439 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5440 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5441 (uchar_t *)optvalp, NULL, cr); 5442 5443 ASSERT(error >= 0); 5444 5445 return (error); 5446 } 5447 5448 int 5449 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5450 void *optvalp, socklen_t *optlen, cred_t *cr) 5451 { 5452 int error; 5453 conn_t *connp = (conn_t *)proto_handle; 5454 t_uscalar_t max_optbuf_len; 5455 void *optvalp_buf; 5456 int len; 5457 5458 /* All Solaris components should pass a cred for this operation. 
*/ 5459 ASSERT(cr != NULL); 5460 5461 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5462 icmp_opt_obj.odb_opt_des_arr, 5463 icmp_opt_obj.odb_opt_arr_cnt, 5464 B_FALSE, B_TRUE, cr); 5465 5466 if (error != 0) { 5467 if (error < 0) { 5468 error = proto_tlitosyserr(-error); 5469 } 5470 return (error); 5471 } 5472 5473 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5474 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5475 if (len == -1) { 5476 kmem_free(optvalp_buf, max_optbuf_len); 5477 return (EINVAL); 5478 } 5479 5480 /* 5481 * update optlen and copy option value 5482 */ 5483 t_uscalar_t size = MIN(len, *optlen); 5484 5485 bcopy(optvalp_buf, optvalp, size); 5486 bcopy(&size, optlen, sizeof (size)); 5487 5488 kmem_free(optvalp_buf, max_optbuf_len); 5489 return (0); 5490 } 5491 5492 /* ARGSUSED1 */ 5493 int 5494 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5495 { 5496 conn_t *connp = (conn_t *)proto_handle; 5497 5498 /* All Solaris components should pass a cred for this operation. */ 5499 ASSERT(cr != NULL); 5500 5501 (void) rawip_do_close(connp); 5502 return (0); 5503 } 5504 5505 /* ARGSUSED2 */ 5506 int 5507 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5508 { 5509 conn_t *connp = (conn_t *)proto_handle; 5510 5511 /* All Solaris components should pass a cred for this operation. 
*/ 5512 ASSERT(cr != NULL); 5513 5514 /* shut down the send side */ 5515 if (how != SHUT_RD) 5516 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5517 SOCK_OPCTL_SHUT_SEND, 0); 5518 /* shut down the recv side */ 5519 if (how != SHUT_WR) 5520 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5521 SOCK_OPCTL_SHUT_RECV, 0); 5522 return (0); 5523 } 5524 5525 void 5526 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5527 { 5528 conn_t *connp = (conn_t *)proto_handle; 5529 icmp_t *icmp = connp->conn_icmp; 5530 5531 mutex_enter(&icmp->icmp_recv_lock); 5532 connp->conn_flow_cntrld = B_FALSE; 5533 mutex_exit(&icmp->icmp_recv_lock); 5534 } 5535 5536 int 5537 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5538 int mode, int32_t *rvalp, cred_t *cr) 5539 { 5540 conn_t *connp = (conn_t *)proto_handle; 5541 int error; 5542 5543 /* All Solaris components should pass a cred for this operation. */ 5544 ASSERT(cr != NULL); 5545 5546 /* 5547 * If we don't have a helper stream then create one. 5548 * ip_create_helper_stream takes care of locking the conn_t, 5549 * so this check for NULL is just a performance optimization. 5550 */ 5551 if (connp->conn_helper_info == NULL) { 5552 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5553 5554 ASSERT(is->is_ldi_ident != NULL); 5555 5556 /* 5557 * Create a helper stream for non-STREAMS socket. 
/*
 * Transmit the M_DATA message `mp' on a raw IP socket.
 *
 * msg->msg_name, when non-NULL, is the destination; when NULL the socket
 * must be connected (TS_DATA_XFER).  Ancillary data in msg
 * (msg_controllen != 0) is handled by icmp_output_ancillary().  An unbound
 * socket is implicitly bound first.  When icmp_hdrincl is set the message
 * is passed to icmp_output_hdrincl() instead.
 *
 * Once the message has been handed to one of the icmp_output_* routines,
 * the result is reported as 0 if is_sendto_ignerr is set, regardless of
 * the actual error.
 *
 * NOTE(review): mp is freed here on implicit-bind failure and on pullup
 * failure, but not on the other early error returns (EDESTADDRREQ,
 * EISCONN, address verification failure, EADDRNOTAVAIL, ENOMEM, a delayed
 * error, or the default EINVAL).  Confirm against the caller whether mp
 * is freed there or leaked.
 */
int
rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
    cred_t *cr)
{
	sin6_t		*sin6;
	sin_t		*sin = NULL;
	uint_t		srcid;
	conn_t		*connp = (conn_t *)proto_handle;
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	icmp_stack_t	*is = icmp->icmp_is;
	pid_t		pid = curproc->p_pid;
	ip_xmit_attr_t	*ixa;

	ASSERT(DB_TYPE(mp) == M_DATA);

	/* All Solaris components should pass a cred for this operation. */
	ASSERT(cr != NULL);

	/* do an implicit bind if necessary */
	if (icmp->icmp_state == TS_UNBND) {
		error = rawip_implicit_bind(connp);
		/*
		 * We could be racing with an actual bind, in which case
		 * we would see EPROTO. We cross our fingers and try
		 * to send.
		 */
		if (!(error == 0 || error == EPROTO)) {
			freemsg(mp);
			return (error);
		}
	}

	/* Protocol 255 contains full IP headers */
	/* Read without holding lock */
	if (icmp->icmp_hdrincl) {
		ASSERT(connp->conn_ipversion == IPV4_VERSION);
		/* Need at least a simple IPv4 header in the first mblk */
		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
				freemsg(mp);
				return (EINVAL);
			}
		}
		error = icmp_output_hdrincl(connp, mp, cr, pid);
		if (is->is_sendto_ignerr)
			return (0);
		else
			return (error);
	}

	/* Connected? */
	if (msg->msg_name == NULL) {
		if (icmp->icmp_state != TS_DATA_XFER) {
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			return (EDESTADDRREQ);
		}
		if (msg->msg_controllen != 0) {
			error = icmp_output_ancillary(connp, NULL, NULL, mp,
			    NULL, msg, cr, pid);
		} else {
			error = icmp_output_connected(connp, mp, cr, pid);
		}
		if (is->is_sendto_ignerr)
			return (0);
		else
			return (error);
	}
	/* A destination was supplied, which a connected socket must not do */
	if (icmp->icmp_state == TS_DATA_XFER) {
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		return (EISCONN);
	}
	error = proto_verify_ip_addr(connp->conn_family,
	    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
	if (error != 0) {
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		return (error);
	}
	switch (connp->conn_family) {
	case AF_INET6:
		sin6 = (sin6_t *)msg->msg_name;

		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			return (EADDRNOTAVAIL);
		}
		srcid = sin6->__sin6_src_id;

		/*
		 * If the local address is a mapped address return
		 * an error.
		 * It would be possible to send an IPv6 packet but the
		 * response would never make it back to the application
		 * since it is bound to a mapped address.
		 */
		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			return (EADDRNOTAVAIL);
		}

		/* An unspecified destination is rewritten to loopback */
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			sin6->sin6_addr = ipv6_loopback;

		/*
		 * We have to allocate an ip_xmit_attr_t before we grab
		 * conn_lock and we need to hold conn_lock once we've checked
		 * conn_same_as_last_v6 to handle concurrent send* calls on a
		 * socket.
		 */
		if (msg->msg_controllen == 0) {
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
				return (ENOMEM);
			}
		} else {
			/* The ancillary-data path is called without an ixa */
			ixa = NULL;
		}
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_delayed_error != 0) {
			sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;

			/* The delayed error is consumed whether or not it matches */
			error = icmp->icmp_delayed_error;
			icmp->icmp_delayed_error = 0;

			/* Compare IP address and family */

			if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
			    &sin2->sin6_addr) &&
			    sin6->sin6_family == sin2->sin6_family) {
				mutex_exit(&connp->conn_lock);
				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
				if (ixa != NULL)
					ixa_refrele(ixa);
				return (error);
			}
		}
		if (msg->msg_controllen != 0) {
			mutex_exit(&connp->conn_lock);
			ASSERT(ixa == NULL);
			error = icmp_output_ancillary(connp, NULL, sin6, mp,
			    NULL, msg, cr, pid);
		} else if (conn_same_as_last_v6(connp, sin6) &&
		    connp->conn_lastsrcid == srcid &&
		    ipsec_outbound_policy_current(ixa)) {
			/* icmp_output_lastdst drops conn_lock */
			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
		} else {
			/* icmp_output_newdst drops conn_lock */
			error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
			    pid, ixa);
		}
		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		if (is->is_sendto_ignerr)
			return (0);
		else
			return (error);
	case AF_INET:
		sin = (sin_t *)msg->msg_name;

		/* An unspecified destination is rewritten to loopback */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		/*
		 * We have to allocate an ip_xmit_attr_t before we grab
		 * conn_lock and we need to hold conn_lock once we've checked
		 * conn_same_as_last_v4 to handle concurrent send* on a socket.
		 */
		if (msg->msg_controllen == 0) {
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
				return (ENOMEM);
			}
		} else {
			/* The ancillary-data path is called without an ixa */
			ixa = NULL;
		}
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_delayed_error != 0) {
			sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;

			/* The delayed error is consumed whether or not it matches */
			error = icmp->icmp_delayed_error;
			icmp->icmp_delayed_error = 0;

			/* Compare IP address */

			if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
				mutex_exit(&connp->conn_lock);
				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
				if (ixa != NULL)
					ixa_refrele(ixa);
				return (error);
			}
		}

		if (msg->msg_controllen != 0) {
			mutex_exit(&connp->conn_lock);
			ASSERT(ixa == NULL);
			error = icmp_output_ancillary(connp, sin, NULL, mp,
			    NULL, msg, cr, pid);
		} else if (conn_same_as_last_v4(connp, sin) &&
		    ipsec_outbound_policy_current(ixa)) {
			/* icmp_output_lastdst drops conn_lock */
			error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
		} else {
			/* icmp_output_newdst drops conn_lock */
			error = icmp_output_newdst(connp, mp, sin, NULL, cr,
			    pid, ixa);
		}
		ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		if (is->is_sendto_ignerr)
			return (0);
		else
			return (error);
	default:
		return (EINVAL);
	}
}
rawip_bind, 5812 rawip_listen, 5813 rawip_connect, 5814 rawip_getpeername, 5815 rawip_getsockname, 5816 rawip_getsockopt, 5817 rawip_setsockopt, 5818 rawip_send, 5819 NULL, 5820 NULL, 5821 NULL, 5822 rawip_shutdown, 5823 rawip_clr_flowctrl, 5824 rawip_ioctl, 5825 rawip_close 5826 }; 5827