1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. 25 * Copyright (c) 2018, Joyent, Inc. 26 * Copyright 2024 Oxide Computer Company 27 */ 28 /* Copyright (c) 1990 Mentat Inc. 
*/ 29 30 #include <sys/types.h> 31 #include <sys/stream.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #include <sys/strsun.h> 35 #define _SUN_TPI_VERSION 2 36 #include <sys/tihdr.h> 37 #include <sys/timod.h> 38 #include <sys/ddi.h> 39 #include <sys/sunddi.h> 40 #include <sys/strsubr.h> 41 #include <sys/suntpi.h> 42 #include <sys/xti_inet.h> 43 #include <sys/cmn_err.h> 44 #include <sys/kmem.h> 45 #include <sys/cred.h> 46 #include <sys/policy.h> 47 #include <sys/priv.h> 48 #include <sys/ucred.h> 49 #include <sys/zone.h> 50 51 #include <sys/sockio.h> 52 #include <sys/socket.h> 53 #include <sys/socketvar.h> 54 #include <sys/vtrace.h> 55 #include <sys/sdt.h> 56 #include <sys/debug.h> 57 #include <sys/isa_defs.h> 58 #include <sys/random.h> 59 #include <netinet/in.h> 60 #include <netinet/ip6.h> 61 #include <netinet/icmp6.h> 62 #include <netinet/udp.h> 63 64 #include <inet/common.h> 65 #include <inet/ip.h> 66 #include <inet/ip_impl.h> 67 #include <inet/ipsec_impl.h> 68 #include <inet/ip6.h> 69 #include <inet/ip_ire.h> 70 #include <inet/ip_if.h> 71 #include <inet/ip_multi.h> 72 #include <inet/ip_ndp.h> 73 #include <inet/proto_set.h> 74 #include <inet/mib2.h> 75 #include <inet/nd.h> 76 #include <inet/optcom.h> 77 #include <inet/snmpcom.h> 78 #include <inet/kstatcom.h> 79 #include <inet/ipclassifier.h> 80 81 #include <sys/tsol/label.h> 82 #include <sys/tsol/tnet.h> 83 84 #include <inet/rawip_impl.h> 85 86 #include <sys/disp.h> 87 88 /* 89 * Synchronization notes: 90 * 91 * RAWIP is MT and uses the usual kernel synchronization primitives. We use 92 * conn_lock to protect the icmp_t. 93 * 94 * Plumbing notes: 95 * ICMP is always a device driver. For compatibility with mibopen() code 96 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough 97 * dummy module. 
98 */ 99 static void icmp_addr_req(queue_t *q, mblk_t *mp); 100 static void icmp_tpi_bind(queue_t *q, mblk_t *mp); 101 static void icmp_bind_proto(icmp_t *icmp); 102 static int icmp_build_hdr_template(conn_t *, const in6_addr_t *, 103 const in6_addr_t *, uint32_t); 104 static void icmp_capability_req(queue_t *q, mblk_t *mp); 105 static int icmp_close(queue_t *q, int flags, cred_t *); 106 static void icmp_close_free(conn_t *); 107 static void icmp_tpi_connect(queue_t *q, mblk_t *mp); 108 static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp); 109 static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, 110 int sys_error); 111 static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 112 t_scalar_t tlierr, int sys_error); 113 static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, 114 ip_recv_attr_t *); 115 static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, 116 ip_recv_attr_t *); 117 static void icmp_info_req(queue_t *q, mblk_t *mp); 118 static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); 119 static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags); 120 static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, 121 cred_t *credp); 122 static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, 123 cred_t *credp); 124 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); 125 int icmp_opt_set(conn_t *connp, uint_t optset_context, 126 int level, int name, uint_t inlen, 127 uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 128 void *thisdg_attrs, cred_t *cr); 129 int icmp_opt_get(conn_t *connp, int level, int name, 130 uchar_t *ptr); 131 static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, 132 sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa); 133 static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, 134 const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *); 135 static mblk_t 
*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *, 136 mblk_t *, const in6_addr_t *, uint32_t, int *); 137 static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 138 uchar_t *ptr, int len); 139 static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); 140 static void icmp_tpi_unbind(queue_t *q, mblk_t *mp); 141 static int icmp_wput(queue_t *q, mblk_t *mp); 142 static int icmp_wput_fallback(queue_t *q, mblk_t *mp); 143 static void icmp_wput_other(queue_t *q, mblk_t *mp); 144 static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); 145 static void icmp_wput_restricted(queue_t *q, mblk_t *mp); 146 static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t); 147 148 static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns); 149 static void rawip_stack_fini(netstackid_t stackid, void *arg); 150 151 static void *rawip_kstat_init(netstackid_t stackid); 152 static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp); 153 static int rawip_kstat_update(kstat_t *kp, int rw); 154 static void rawip_stack_shutdown(netstackid_t stackid, void *arg); 155 156 /* Common routines for TPI and socket module */ 157 static conn_t *rawip_do_open(int, cred_t *, int *, int); 158 static void rawip_do_close(conn_t *); 159 static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t); 160 static int rawip_do_unbind(conn_t *); 161 static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t, 162 cred_t *, pid_t); 163 164 int rawip_getsockname(sock_lower_handle_t, struct sockaddr *, 165 socklen_t *, cred_t *); 166 int rawip_getpeername(sock_lower_handle_t, struct sockaddr *, 167 socklen_t *, cred_t *); 168 169 static struct module_info icmp_mod_info = { 170 5707, "icmp", 1, INFPSZ, 512, 128 171 }; 172 173 /* 174 * Entry points for ICMP as a device. 175 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices. 
176 */ 177 static struct qinit icmprinitv4 = { 178 NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info 179 }; 180 181 static struct qinit icmprinitv6 = { 182 NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info 183 }; 184 185 static struct qinit icmpwinit = { 186 icmp_wput, ip_wsrv, NULL, NULL, NULL, &icmp_mod_info 187 }; 188 189 /* ICMP entry point during fallback */ 190 static struct qinit icmp_fallback_sock_winit = { 191 icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info 192 }; 193 194 /* For AF_INET aka /dev/icmp */ 195 struct streamtab icmpinfov4 = { 196 &icmprinitv4, &icmpwinit 197 }; 198 199 /* For AF_INET6 aka /dev/icmp6 */ 200 struct streamtab icmpinfov6 = { 201 &icmprinitv6, &icmpwinit 202 }; 203 204 /* Default structure copied into T_INFO_ACK messages */ 205 static struct T_info_ack icmp_g_t_info_ack = { 206 T_INFO_ACK, 207 IP_MAXPACKET, /* TSDU_size. icmp allows maximum size messages. */ 208 T_INVALID, /* ETSDU_size. icmp does not support expedited data. */ 209 T_INVALID, /* CDATA_size. icmp does not support connect data. */ 210 T_INVALID, /* DDATA_size. icmp does not support disconnect data. */ 211 0, /* ADDR_size - filled in later. */ 212 0, /* OPT_size - not initialized here */ 213 IP_MAXPACKET, /* TIDU_size. icmp allows maximum size messages. */ 214 T_CLTS, /* SERV_type. icmp supports connection-less. */ 215 TS_UNBND, /* CURRENT_state. This is set from icmp_state. 
*/ 216 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 217 }; 218 219 static int 220 icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo, 221 const char *ifname, const void *pval, uint_t flags) 222 { 223 return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl, 224 stack, cr, pinfo, ifname, pval, flags)); 225 } 226 227 static int 228 icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname, 229 void *val, uint_t psize, uint_t flags) 230 { 231 return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack, 232 pinfo, ifname, val, psize, flags)); 233 } 234 235 /* 236 * All of these are alterable, within the min/max values given, at run time. 237 * 238 * Note: All those tunables which do not start with "icmp_" are Committed and 239 * therefore are public. See PSARC 2010/080. 240 */ 241 static mod_prop_info_t icmp_propinfo_tbl[] = { 242 /* tunable - 0 */ 243 { "_wroff_extra", MOD_PROTO_RAWIP, 244 mod_set_uint32, mod_get_uint32, 245 {0, 128, 32}, {32} }, 246 247 { "_ipv4_ttl", MOD_PROTO_RAWIP, 248 mod_set_uint32, mod_get_uint32, 249 {1, 255, 255}, {255} }, 250 251 { "_ipv6_hoplimit", MOD_PROTO_RAWIP, 252 mod_set_uint32, mod_get_uint32, 253 {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS}, 254 {IPV6_DEFAULT_HOPS} }, 255 256 { "_bsd_compat", MOD_PROTO_RAWIP, 257 mod_set_boolean, mod_get_boolean, 258 {B_TRUE}, {B_TRUE} }, 259 260 { "send_buf", MOD_PROTO_RAWIP, 261 icmp_set_buf_prop, icmp_get_buf_prop, 262 {4096, 65536, 8192}, {8192} }, 263 264 { "_xmit_lowat", MOD_PROTO_RAWIP, 265 mod_set_uint32, mod_get_uint32, 266 {0, 65536, 1024}, {1024} }, 267 268 { "recv_buf", MOD_PROTO_RAWIP, 269 icmp_set_buf_prop, icmp_get_buf_prop, 270 {4096, 65536, 8192}, {8192} }, 271 272 { "max_buf", MOD_PROTO_RAWIP, 273 mod_set_uint32, mod_get_uint32, 274 {65536, ULP_MAX_BUF, 256*1024}, {256*1024} }, 275 276 { "_pmtu_discovery", MOD_PROTO_RAWIP, 277 mod_set_boolean, mod_get_boolean, 278 {B_FALSE}, {B_FALSE} }, 279 280 { "_sendto_ignerr", MOD_PROTO_RAWIP, 281 
mod_set_boolean, mod_get_boolean, 282 {B_FALSE}, {B_FALSE} }, 283 284 { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} }, 285 286 { NULL, 0, NULL, NULL, {0}, {0} } 287 }; 288 289 #define is_wroff_extra is_propinfo_tbl[0].prop_cur_uval 290 #define is_ipv4_ttl is_propinfo_tbl[1].prop_cur_uval 291 #define is_ipv6_hoplimit is_propinfo_tbl[2].prop_cur_uval 292 #define is_bsd_compat is_propinfo_tbl[3].prop_cur_bval 293 #define is_xmit_hiwat is_propinfo_tbl[4].prop_cur_uval 294 #define is_xmit_lowat is_propinfo_tbl[5].prop_cur_uval 295 #define is_recv_hiwat is_propinfo_tbl[6].prop_cur_uval 296 #define is_max_buf is_propinfo_tbl[7].prop_cur_uval 297 #define is_pmtu_discovery is_propinfo_tbl[8].prop_cur_bval 298 #define is_sendto_ignerr is_propinfo_tbl[9].prop_cur_bval 299 300 typedef union T_primitives *t_primp_t; 301 302 /* 303 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 304 * passed to icmp_wput. 305 * It calls IP to verify the local IP address, and calls IP to insert 306 * the conn_t in the fanout table. 307 * If everything is ok it then sends the T_BIND_ACK back up. 308 */ 309 static void 310 icmp_tpi_bind(queue_t *q, mblk_t *mp) 311 { 312 int error; 313 struct sockaddr *sa; 314 struct T_bind_req *tbr; 315 socklen_t len; 316 sin_t *sin; 317 sin6_t *sin6; 318 icmp_t *icmp; 319 conn_t *connp = Q_TO_CONN(q); 320 mblk_t *mp1; 321 cred_t *cr; 322 323 /* 324 * All Solaris components should pass a db_credp 325 * for this TPI message, hence we ASSERT. 326 * But in case there is some other M_PROTO that looks 327 * like a TPI message sent by some other kernel 328 * component, we check and return an error. 
329 */ 330 cr = msg_getcred(mp, NULL); 331 ASSERT(cr != NULL); 332 if (cr == NULL) { 333 icmp_err_ack(q, mp, TSYSERR, EINVAL); 334 return; 335 } 336 337 icmp = connp->conn_icmp; 338 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 339 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 340 "icmp_bind: bad req, len %u", 341 (uint_t)(mp->b_wptr - mp->b_rptr)); 342 icmp_err_ack(q, mp, TPROTO, 0); 343 return; 344 } 345 346 if (icmp->icmp_state != TS_UNBND) { 347 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 348 "icmp_bind: bad state, %u", icmp->icmp_state); 349 icmp_err_ack(q, mp, TOUTSTATE, 0); 350 return; 351 } 352 353 /* 354 * Reallocate the message to make sure we have enough room for an 355 * address. 356 */ 357 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 358 if (mp1 == NULL) { 359 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 360 return; 361 } 362 mp = mp1; 363 364 /* Reset the message type in preparation for shipping it back. */ 365 DB_TYPE(mp) = M_PCPROTO; 366 tbr = (struct T_bind_req *)mp->b_rptr; 367 len = tbr->ADDR_length; 368 switch (len) { 369 case 0: /* request for a generic port */ 370 tbr->ADDR_offset = sizeof (struct T_bind_req); 371 if (connp->conn_family == AF_INET) { 372 tbr->ADDR_length = sizeof (sin_t); 373 sin = (sin_t *)&tbr[1]; 374 *sin = sin_null; 375 sin->sin_family = AF_INET; 376 mp->b_wptr = (uchar_t *)&sin[1]; 377 sa = (struct sockaddr *)sin; 378 len = sizeof (sin_t); 379 } else { 380 ASSERT(connp->conn_family == AF_INET6); 381 tbr->ADDR_length = sizeof (sin6_t); 382 sin6 = (sin6_t *)&tbr[1]; 383 *sin6 = sin6_null; 384 sin6->sin6_family = AF_INET6; 385 mp->b_wptr = (uchar_t *)&sin6[1]; 386 sa = (struct sockaddr *)sin6; 387 len = sizeof (sin6_t); 388 } 389 break; 390 391 case sizeof (sin_t): /* Complete IPv4 address */ 392 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 393 sizeof (sin_t)); 394 break; 395 396 case sizeof (sin6_t): /* Complete IPv6 address */ 397 sa = (struct sockaddr *)mi_offset_param(mp, 398 
tbr->ADDR_offset, sizeof (sin6_t)); 399 break; 400 401 default: 402 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 403 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 404 icmp_err_ack(q, mp, TBADADDR, 0); 405 return; 406 } 407 408 error = rawip_do_bind(connp, sa, len); 409 if (error != 0) { 410 if (error > 0) { 411 icmp_err_ack(q, mp, TSYSERR, error); 412 } else { 413 icmp_err_ack(q, mp, -error, 0); 414 } 415 } else { 416 tbr->PRIM_type = T_BIND_ACK; 417 qreply(q, mp); 418 } 419 } 420 421 static int 422 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 423 { 424 sin_t *sin; 425 sin6_t *sin6; 426 icmp_t *icmp = connp->conn_icmp; 427 int error = 0; 428 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 429 in_port_t lport; /* Network byte order */ 430 ipaddr_t v4src; /* Set if AF_INET */ 431 in6_addr_t v6src; 432 uint_t scopeid = 0; 433 zoneid_t zoneid = IPCL_ZONEID(connp); 434 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 435 436 if (sa == NULL || !OK_32PTR((char *)sa)) { 437 return (EINVAL); 438 } 439 440 switch (len) { 441 case sizeof (sin_t): /* Complete IPv4 address */ 442 sin = (sin_t *)sa; 443 if (sin->sin_family != AF_INET || 444 connp->conn_family != AF_INET) { 445 /* TSYSERR, EAFNOSUPPORT */ 446 return (EAFNOSUPPORT); 447 } 448 v4src = sin->sin_addr.s_addr; 449 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 450 if (v4src != INADDR_ANY) { 451 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 452 B_TRUE); 453 } 454 lport = sin->sin_port; 455 break; 456 case sizeof (sin6_t): /* Complete IPv6 address */ 457 sin6 = (sin6_t *)sa; 458 if (sin6->sin6_family != AF_INET6 || 459 connp->conn_family != AF_INET6) { 460 /* TSYSERR, EAFNOSUPPORT */ 461 return (EAFNOSUPPORT); 462 } 463 /* No support for mapped addresses on raw sockets */ 464 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 465 /* TSYSERR, EADDRNOTAVAIL */ 466 return (EADDRNOTAVAIL); 467 } 468 v6src = sin6->sin6_addr; 469 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 470 if 
(IN6_IS_ADDR_LINKSCOPE(&v6src)) 471 scopeid = sin6->sin6_scope_id; 472 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 473 B_TRUE, scopeid); 474 } 475 lport = sin6->sin6_port; 476 break; 477 478 default: 479 /* TBADADDR */ 480 return (EADDRNOTAVAIL); 481 } 482 483 /* Is the local address a valid unicast, multicast, or broadcast? */ 484 if (laddr_type == IPVL_BAD) 485 return (EADDRNOTAVAIL); 486 487 /* 488 * The state must be TS_UNBND. 489 */ 490 mutex_enter(&connp->conn_lock); 491 if (icmp->icmp_state != TS_UNBND) { 492 mutex_exit(&connp->conn_lock); 493 return (-TOUTSTATE); 494 } 495 496 /* 497 * Copy the source address into our icmp structure. This address 498 * may still be zero; if so, ip will fill in the correct address 499 * each time an outbound packet is passed to it. 500 * If we are binding to a broadcast or multicast address then 501 * we just set the conn_bound_addr since we don't want to use 502 * that as the source address when sending. 503 */ 504 connp->conn_bound_addr_v6 = v6src; 505 connp->conn_laddr_v6 = v6src; 506 if (scopeid != 0) { 507 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 508 connp->conn_ixa->ixa_scopeid = scopeid; 509 connp->conn_incoming_ifindex = scopeid; 510 } else { 511 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 512 connp->conn_incoming_ifindex = connp->conn_bound_if; 513 } 514 515 switch (laddr_type) { 516 case IPVL_UNICAST_UP: 517 case IPVL_UNICAST_DOWN: 518 connp->conn_saddr_v6 = v6src; 519 connp->conn_mcbc_bind = B_FALSE; 520 break; 521 case IPVL_MCAST: 522 case IPVL_BCAST: 523 /* ip_set_destination will pick a source address later */ 524 connp->conn_saddr_v6 = ipv6_all_zeros; 525 connp->conn_mcbc_bind = B_TRUE; 526 break; 527 } 528 529 /* Any errors after this point should use late_error */ 530 531 /* 532 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 533 * with IPPROTO_TCP. 
534 */ 535 connp->conn_lport = lport; 536 connp->conn_fport = 0; 537 538 if (connp->conn_family == AF_INET) { 539 ASSERT(connp->conn_ipversion == IPV4_VERSION); 540 } else { 541 ASSERT(connp->conn_ipversion == IPV6_VERSION); 542 } 543 544 icmp->icmp_state = TS_IDLE; 545 546 /* 547 * We create an initial header template here to make a subsequent 548 * sendto have a starting point. Since conn_last_dst is zero the 549 * first sendto will always follow the 'dst changed' code path. 550 * Note that we defer massaging options and the related checksum 551 * adjustment until we have a destination address. 552 */ 553 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 554 &connp->conn_faddr_v6, connp->conn_flowinfo); 555 if (error != 0) { 556 mutex_exit(&connp->conn_lock); 557 goto late_error; 558 } 559 /* Just in case */ 560 connp->conn_faddr_v6 = ipv6_all_zeros; 561 connp->conn_v6lastdst = ipv6_all_zeros; 562 mutex_exit(&connp->conn_lock); 563 564 error = ip_laddr_fanout_insert(connp); 565 if (error != 0) 566 goto late_error; 567 568 /* Bind succeeded */ 569 return (0); 570 571 late_error: 572 mutex_enter(&connp->conn_lock); 573 connp->conn_saddr_v6 = ipv6_all_zeros; 574 connp->conn_bound_addr_v6 = ipv6_all_zeros; 575 connp->conn_laddr_v6 = ipv6_all_zeros; 576 if (scopeid != 0) { 577 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 578 connp->conn_incoming_ifindex = connp->conn_bound_if; 579 } 580 icmp->icmp_state = TS_UNBND; 581 connp->conn_v6lastdst = ipv6_all_zeros; 582 connp->conn_lport = 0; 583 584 /* Restore the header that was built above - different source address */ 585 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 586 &connp->conn_faddr_v6, connp->conn_flowinfo); 587 mutex_exit(&connp->conn_lock); 588 return (error); 589 } 590 591 /* 592 * Tell IP to just bind to the protocol. 
593 */ 594 static void 595 icmp_bind_proto(icmp_t *icmp) 596 { 597 conn_t *connp = icmp->icmp_connp; 598 599 mutex_enter(&connp->conn_lock); 600 connp->conn_saddr_v6 = ipv6_all_zeros; 601 connp->conn_laddr_v6 = ipv6_all_zeros; 602 connp->conn_faddr_v6 = ipv6_all_zeros; 603 connp->conn_v6lastdst = ipv6_all_zeros; 604 mutex_exit(&connp->conn_lock); 605 606 (void) ip_laddr_fanout_insert(connp); 607 } 608 609 /* 610 * This routine handles each T_CONN_REQ message passed to icmp. It 611 * associates a default destination address with the stream. 612 * 613 * After various error checks are completed, icmp_connect() lays 614 * the target address and port into the composite header template. 615 * Then we ask IP for information, including a source address if we didn't 616 * already have one. Finally we send up the T_OK_ACK reply message. 617 */ 618 static void 619 icmp_tpi_connect(queue_t *q, mblk_t *mp) 620 { 621 conn_t *connp = Q_TO_CONN(q); 622 struct T_conn_req *tcr; 623 struct sockaddr *sa; 624 socklen_t len; 625 int error; 626 cred_t *cr; 627 pid_t pid; 628 /* 629 * All Solaris components should pass a db_credp 630 * for this TPI message, hence we ASSERT. 631 * But in case there is some other M_PROTO that looks 632 * like a TPI message sent by some other kernel 633 * component, we check and return an error. 
634 */ 635 cr = msg_getcred(mp, &pid); 636 ASSERT(cr != NULL); 637 if (cr == NULL) { 638 icmp_err_ack(q, mp, TSYSERR, EINVAL); 639 return; 640 } 641 642 tcr = (struct T_conn_req *)mp->b_rptr; 643 /* Sanity checks */ 644 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 645 icmp_err_ack(q, mp, TPROTO, 0); 646 return; 647 } 648 649 if (tcr->OPT_length != 0) { 650 icmp_err_ack(q, mp, TBADOPT, 0); 651 return; 652 } 653 654 len = tcr->DEST_length; 655 656 switch (len) { 657 default: 658 icmp_err_ack(q, mp, TBADADDR, 0); 659 return; 660 case sizeof (sin_t): 661 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 662 sizeof (sin_t)); 663 break; 664 case sizeof (sin6_t): 665 sa = (struct sockaddr *)mi_offset_param(mp, 666 tcr->DEST_offset, sizeof (sin6_t)); 667 break; 668 } 669 670 error = proto_verify_ip_addr(connp->conn_family, sa, len); 671 if (error != 0) { 672 icmp_err_ack(q, mp, TSYSERR, error); 673 return; 674 } 675 676 error = rawip_do_connect(connp, sa, len, cr, pid); 677 if (error != 0) { 678 if (error < 0) { 679 icmp_err_ack(q, mp, -error, 0); 680 } else { 681 icmp_err_ack(q, mp, 0, error); 682 } 683 } else { 684 mblk_t *mp1; 685 686 /* 687 * We have to send a connection confirmation to 688 * keep TLI happy. 689 */ 690 if (connp->conn_family == AF_INET) { 691 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 692 sizeof (sin_t), NULL, 0); 693 } else { 694 ASSERT(connp->conn_family == AF_INET6); 695 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 696 sizeof (sin6_t), NULL, 0); 697 } 698 if (mp1 == NULL) { 699 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 700 return; 701 } 702 703 /* 704 * Send ok_ack for T_CONN_REQ 705 */ 706 mp = mi_tpi_ok_ack_alloc(mp); 707 if (mp == NULL) { 708 /* Unable to reuse the T_CONN_REQ for the ack. 
*/ 709 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 710 return; 711 } 712 putnext(connp->conn_rq, mp); 713 putnext(connp->conn_rq, mp1); 714 } 715 } 716 717 static int 718 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 719 cred_t *cr, pid_t pid) 720 { 721 icmp_t *icmp; 722 sin_t *sin; 723 sin6_t *sin6; 724 int error; 725 uint16_t dstport; 726 ipaddr_t v4dst; 727 in6_addr_t v6dst; 728 uint32_t flowinfo; 729 ip_xmit_attr_t *ixa; 730 ip_xmit_attr_t *oldixa; 731 uint_t scopeid = 0; 732 uint_t srcid = 0; 733 in6_addr_t v6src = connp->conn_saddr_v6; 734 735 icmp = connp->conn_icmp; 736 737 if (sa == NULL || !OK_32PTR((char *)sa)) { 738 return (EINVAL); 739 } 740 741 ASSERT(sa != NULL && len != 0); 742 sin = NULL; 743 sin6 = NULL; 744 dstport = 0; 745 flowinfo = 0; 746 v4dst = INADDR_ANY; 747 748 /* 749 * Determine packet type based on type of address passed in 750 * the request should contain an IPv4 or IPv6 address. 751 * Make sure that address family matches the type of 752 * family of the address passed down. 753 */ 754 switch (len) { 755 case sizeof (sin_t): 756 sin = (sin_t *)sa; 757 758 v4dst = sin->sin_addr.s_addr; 759 dstport = sin->sin_port; 760 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 761 ASSERT(connp->conn_ipversion == IPV4_VERSION); 762 break; 763 764 case sizeof (sin6_t): 765 sin6 = (sin6_t *)sa; 766 767 /* No support for mapped addresses on raw sockets */ 768 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 769 return (EADDRNOTAVAIL); 770 } 771 v6dst = sin6->sin6_addr; 772 dstport = sin6->sin6_port; 773 ASSERT(connp->conn_ipversion == IPV6_VERSION); 774 flowinfo = sin6->sin6_flowinfo; 775 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 776 scopeid = sin6->sin6_scope_id; 777 srcid = sin6->__sin6_src_id; 778 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 779 /* Due to check above, we know sin6_addr is v6-only. 
*/ 780 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 781 B_FALSE, connp->conn_netstack)) { 782 /* Mismatch - v6src would be v4mapped. */ 783 return (EADDRNOTAVAIL); 784 } 785 } 786 break; 787 } 788 789 /* 790 * If there is a different thread using conn_ixa then we get a new 791 * copy and cut the old one loose from conn_ixa. Otherwise we use 792 * conn_ixa and prevent any other thread from using/changing it. 793 * Once connect() is done other threads can use conn_ixa since the 794 * refcnt will be back at one. 795 * We defer updating conn_ixa until later to handle any concurrent 796 * conn_ixa_cleanup thread. 797 */ 798 ixa = conn_get_ixa(connp, B_FALSE); 799 if (ixa == NULL) 800 return (ENOMEM); 801 802 mutex_enter(&connp->conn_lock); 803 /* 804 * This icmp_t must have bound already before doing a connect. 805 * Reject if a connect is in progress (we drop conn_lock during 806 * rawip_do_connect). 807 */ 808 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 809 mutex_exit(&connp->conn_lock); 810 ixa_refrele(ixa); 811 return (-TOUTSTATE); 812 } 813 814 if (icmp->icmp_state == TS_DATA_XFER) { 815 /* Already connected - clear out state */ 816 if (connp->conn_mcbc_bind) 817 connp->conn_saddr_v6 = ipv6_all_zeros; 818 else 819 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 820 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 821 connp->conn_faddr_v6 = ipv6_all_zeros; 822 icmp->icmp_state = TS_IDLE; 823 } 824 825 /* 826 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 827 * with IPPROTO_TCP. 828 */ 829 connp->conn_fport = dstport; 830 if (connp->conn_ipversion == IPV4_VERSION) { 831 /* 832 * Interpret a zero destination to mean loopback. 833 * Update the T_CONN_REQ (sin/sin6) since it is used to 834 * generate the T_CONN_CON. 
835 */ 836 if (v4dst == INADDR_ANY) { 837 v4dst = htonl(INADDR_LOOPBACK); 838 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 839 ASSERT(connp->conn_family == AF_INET); 840 sin->sin_addr.s_addr = v4dst; 841 } 842 connp->conn_faddr_v6 = v6dst; 843 connp->conn_flowinfo = 0; 844 } else { 845 ASSERT(connp->conn_ipversion == IPV6_VERSION); 846 /* 847 * Interpret a zero destination to mean loopback. 848 * Update the T_CONN_REQ (sin/sin6) since it is used to 849 * generate the T_CONN_CON. 850 */ 851 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 852 v6dst = ipv6_loopback; 853 sin6->sin6_addr = v6dst; 854 } 855 connp->conn_faddr_v6 = v6dst; 856 connp->conn_flowinfo = flowinfo; 857 } 858 859 /* 860 * We update our cred/cpid based on the caller of connect 861 */ 862 if (connp->conn_cred != cr) { 863 crhold(cr); 864 crfree(connp->conn_cred); 865 connp->conn_cred = cr; 866 } 867 connp->conn_cpid = pid; 868 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 869 ixa->ixa_cred = cr; 870 ixa->ixa_cpid = pid; 871 if (is_system_labeled()) { 872 /* We need to restart with a label based on the cred */ 873 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 874 } 875 876 if (scopeid != 0) { 877 ixa->ixa_flags |= IXAF_SCOPEID_SET; 878 ixa->ixa_scopeid = scopeid; 879 connp->conn_incoming_ifindex = scopeid; 880 } else { 881 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 882 connp->conn_incoming_ifindex = connp->conn_bound_if; 883 } 884 885 /* 886 * conn_connect will drop conn_lock and reacquire it. 887 * To prevent a send* from messing with this icmp_t while the lock 888 * is dropped we set icmp_state and clear conn_v6lastdst. 889 * That will make all send* fail with EISCONN. 890 */ 891 connp->conn_v6lastdst = ipv6_all_zeros; 892 icmp->icmp_state = TS_WCON_CREQ; 893 894 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 895 mutex_exit(&connp->conn_lock); 896 if (error != 0) 897 goto connect_failed; 898 899 /* 900 * The addresses have been verified. Time to insert in 901 * the correct fanout list. 
902 */ 903 error = ipcl_conn_insert(connp); 904 if (error != 0) 905 goto connect_failed; 906 907 mutex_enter(&connp->conn_lock); 908 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 909 &connp->conn_faddr_v6, connp->conn_flowinfo); 910 if (error != 0) { 911 mutex_exit(&connp->conn_lock); 912 goto connect_failed; 913 } 914 915 icmp->icmp_state = TS_DATA_XFER; 916 /* Record this as the "last" send even though we haven't sent any */ 917 connp->conn_v6lastdst = connp->conn_faddr_v6; 918 connp->conn_lastipversion = connp->conn_ipversion; 919 connp->conn_lastdstport = connp->conn_fport; 920 connp->conn_lastflowinfo = connp->conn_flowinfo; 921 connp->conn_lastscopeid = scopeid; 922 connp->conn_lastsrcid = srcid; 923 /* Also remember a source to use together with lastdst */ 924 connp->conn_v6lastsrc = v6src; 925 926 oldixa = conn_replace_ixa(connp, ixa); 927 mutex_exit(&connp->conn_lock); 928 ixa_refrele(oldixa); 929 930 ixa_refrele(ixa); 931 return (0); 932 933 connect_failed: 934 if (ixa != NULL) 935 ixa_refrele(ixa); 936 mutex_enter(&connp->conn_lock); 937 icmp->icmp_state = TS_IDLE; 938 /* In case the source address was set above */ 939 if (connp->conn_mcbc_bind) 940 connp->conn_saddr_v6 = ipv6_all_zeros; 941 else 942 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 943 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 944 connp->conn_faddr_v6 = ipv6_all_zeros; 945 connp->conn_v6lastdst = ipv6_all_zeros; 946 connp->conn_flowinfo = 0; 947 948 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 949 &connp->conn_faddr_v6, connp->conn_flowinfo); 950 mutex_exit(&connp->conn_lock); 951 return (error); 952 } 953 954 static void 955 rawip_do_close(conn_t *connp) 956 { 957 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 958 959 ip_quiesce_conn(connp); 960 961 if (!IPCL_IS_NONSTR(connp)) { 962 qprocsoff(connp->conn_rq); 963 } 964 965 icmp_close_free(connp); 966 967 /* 968 * Now we are truly single threaded on this stream, and can 969 * delete the things 
hanging off the connp, and finally the connp. 970 * We removed this connp from the fanout list, it cannot be 971 * accessed thru the fanouts, and we already waited for the 972 * conn_ref to drop to 0. We are already in close, so 973 * there cannot be any other thread from the top. qprocsoff 974 * has completed, and service has completed or won't run in 975 * future. 976 */ 977 ASSERT(connp->conn_ref == 1); 978 979 if (!IPCL_IS_NONSTR(connp)) { 980 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 981 } else { 982 ip_free_helper_stream(connp); 983 } 984 985 connp->conn_ref--; 986 ipcl_conn_destroy(connp); 987 } 988 989 /* ARGSUSED */ 990 static int 991 icmp_close(queue_t *q, int flags, cred_t *credp __unused) 992 { 993 conn_t *connp; 994 995 if (flags & SO_FALLBACK) { 996 /* 997 * stream is being closed while in fallback 998 * simply free the resources that were allocated 999 */ 1000 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 1001 qprocsoff(q); 1002 goto done; 1003 } 1004 1005 connp = Q_TO_CONN(q); 1006 (void) rawip_do_close(connp); 1007 done: 1008 q->q_ptr = WR(q)->q_ptr = NULL; 1009 return (0); 1010 } 1011 1012 static void 1013 icmp_close_free(conn_t *connp) 1014 { 1015 icmp_t *icmp = connp->conn_icmp; 1016 1017 if (icmp->icmp_filter != NULL) { 1018 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 1019 icmp->icmp_filter = NULL; 1020 } 1021 1022 /* 1023 * Clear any fields which the kmem_cache constructor clears. 1024 * Only icmp_connp needs to be preserved. 1025 * TBD: We should make this more efficient to avoid clearing 1026 * everything. 1027 */ 1028 ASSERT(icmp->icmp_connp == connp); 1029 bzero(icmp, sizeof (icmp_t)); 1030 icmp->icmp_connp = connp; 1031 } 1032 1033 /* 1034 * This routine handles each T_DISCON_REQ message passed to icmp 1035 * as an indicating that ICMP is no longer connected. This results 1036 * in telling IP to restore the binding to just the local address. 
1037 */ 1038 static int 1039 icmp_do_disconnect(conn_t *connp) 1040 { 1041 icmp_t *icmp = connp->conn_icmp; 1042 int error; 1043 1044 mutex_enter(&connp->conn_lock); 1045 if (icmp->icmp_state != TS_DATA_XFER) { 1046 mutex_exit(&connp->conn_lock); 1047 return (-TOUTSTATE); 1048 } 1049 if (connp->conn_mcbc_bind) 1050 connp->conn_saddr_v6 = ipv6_all_zeros; 1051 else 1052 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1053 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1054 connp->conn_faddr_v6 = ipv6_all_zeros; 1055 icmp->icmp_state = TS_IDLE; 1056 1057 connp->conn_v6lastdst = ipv6_all_zeros; 1058 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1059 &connp->conn_faddr_v6, connp->conn_flowinfo); 1060 mutex_exit(&connp->conn_lock); 1061 if (error != 0) 1062 return (error); 1063 1064 /* 1065 * Tell IP to remove the full binding and revert 1066 * to the local address binding. 1067 */ 1068 return (ip_laddr_fanout_insert(connp)); 1069 } 1070 1071 static void 1072 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1073 { 1074 conn_t *connp = Q_TO_CONN(q); 1075 int error; 1076 1077 /* 1078 * Allocate the largest primitive we need to send back 1079 * T_error_ack is > than T_ok_ack 1080 */ 1081 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1082 if (mp == NULL) { 1083 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1084 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1085 return; 1086 } 1087 1088 error = icmp_do_disconnect(connp); 1089 1090 if (error != 0) { 1091 if (error > 0) { 1092 icmp_err_ack(q, mp, 0, error); 1093 } else { 1094 icmp_err_ack(q, mp, -error, 0); 1095 } 1096 } else { 1097 mp = mi_tpi_ok_ack_alloc(mp); 1098 ASSERT(mp != NULL); 1099 qreply(q, mp); 1100 } 1101 } 1102 1103 static int 1104 icmp_disconnect(conn_t *connp) 1105 { 1106 int error; 1107 1108 connp->conn_dgram_errind = B_FALSE; 1109 1110 error = icmp_do_disconnect(connp); 1111 1112 if (error < 0) 1113 error = proto_tlitosyserr(-error); 1114 return (error); 1115 } 1116 1117 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1118 static void 1119 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1120 { 1121 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1122 qreply(q, mp); 1123 } 1124 1125 /* Shorthand to generate and send TPI error acks to our client */ 1126 static void 1127 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1128 t_scalar_t t_error, int sys_error) 1129 { 1130 struct T_error_ack *teackp; 1131 1132 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1133 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1134 teackp = (struct T_error_ack *)mp->b_rptr; 1135 teackp->ERROR_prim = primitive; 1136 teackp->TLI_error = t_error; 1137 teackp->UNIX_error = sys_error; 1138 qreply(q, mp); 1139 } 1140 } 1141 1142 /* 1143 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1144 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1145 * Assumes that IP has pulled up everything up to and including the ICMP header. 
1146 */ 1147 /* ARGSUSED2 */ 1148 static void 1149 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1150 { 1151 conn_t *connp = (conn_t *)arg1; 1152 icmp_t *icmp = connp->conn_icmp; 1153 icmph_t *icmph; 1154 ipha_t *ipha; 1155 int iph_hdr_length; 1156 sin_t sin; 1157 mblk_t *mp1; 1158 int error = 0; 1159 1160 ipha = (ipha_t *)mp->b_rptr; 1161 1162 ASSERT(OK_32PTR(mp->b_rptr)); 1163 1164 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1165 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1166 icmp_icmp_error_ipv6(connp, mp, ira); 1167 return; 1168 } 1169 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1170 1171 /* Skip past the outer IP and ICMP headers */ 1172 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1173 iph_hdr_length = ira->ira_ip_hdr_length; 1174 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1175 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1176 1177 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1178 1179 switch (icmph->icmph_type) { 1180 case ICMP_DEST_UNREACHABLE: 1181 switch (icmph->icmph_code) { 1182 case ICMP_FRAGMENTATION_NEEDED: { 1183 ipha_t *ipha; 1184 ip_xmit_attr_t *ixa; 1185 /* 1186 * IP has already adjusted the path MTU. 1187 * But we need to adjust DF for IPv4. 1188 */ 1189 if (connp->conn_ipversion != IPV4_VERSION) 1190 break; 1191 1192 ixa = conn_get_ixa(connp, B_FALSE); 1193 if (ixa == NULL || ixa->ixa_ire == NULL) { 1194 /* 1195 * Some other thread holds conn_ixa. We will 1196 * redo this on the next ICMP too big. 
1197 */ 1198 if (ixa != NULL) 1199 ixa_refrele(ixa); 1200 break; 1201 } 1202 (void) ip_get_pmtu(ixa); 1203 1204 mutex_enter(&connp->conn_lock); 1205 ipha = (ipha_t *)connp->conn_ht_iphc; 1206 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1207 ipha->ipha_fragment_offset_and_flags |= 1208 IPH_DF_HTONS; 1209 } else { 1210 ipha->ipha_fragment_offset_and_flags &= 1211 ~IPH_DF_HTONS; 1212 } 1213 mutex_exit(&connp->conn_lock); 1214 ixa_refrele(ixa); 1215 break; 1216 } 1217 case ICMP_PORT_UNREACHABLE: 1218 case ICMP_PROTOCOL_UNREACHABLE: 1219 error = ECONNREFUSED; 1220 break; 1221 default: 1222 /* Transient errors */ 1223 break; 1224 } 1225 break; 1226 default: 1227 /* Transient errors */ 1228 break; 1229 } 1230 if (error == 0) { 1231 freemsg(mp); 1232 return; 1233 } 1234 1235 /* 1236 * Deliver T_UDERROR_IND when the application has asked for it. 1237 * The socket layer enables this automatically when connected. 1238 */ 1239 if (!connp->conn_dgram_errind) { 1240 freemsg(mp); 1241 return; 1242 } 1243 1244 sin = sin_null; 1245 sin.sin_family = AF_INET; 1246 sin.sin_addr.s_addr = ipha->ipha_dst; 1247 1248 if (IPCL_IS_NONSTR(connp)) { 1249 mutex_enter(&connp->conn_lock); 1250 if (icmp->icmp_state == TS_DATA_XFER) { 1251 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1252 mutex_exit(&connp->conn_lock); 1253 (*connp->conn_upcalls->su_set_error) 1254 (connp->conn_upper_handle, error); 1255 goto done; 1256 } 1257 } else { 1258 icmp->icmp_delayed_error = error; 1259 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1260 } 1261 mutex_exit(&connp->conn_lock); 1262 } else { 1263 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1264 error); 1265 if (mp1 != NULL) 1266 putnext(connp->conn_rq, mp1); 1267 } 1268 done: 1269 freemsg(mp); 1270 } 1271 1272 /* 1273 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1274 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 
 * Assumes that IP has pulled up all the extension headers as well as the
 * ICMPv6 header.
 *
 * An ICMP6_PACKET_TOO_BIG is instead turned into an empty T_UNITDATA_IND
 * carrying an IPV6_PATHMTU option when the application asked for path MTU
 * information.  The message mp is always consumed.
 */
static void
icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h, *outer_ip6h;
	uint16_t	iph_hdr_length;
	uint8_t		*nexthdrp;
	sin6_t		sin6;
	mblk_t		*mp1;
	int		error = 0;
	icmp_t		*icmp = connp->conn_icmp;

	outer_ip6h = (ip6_t *)mp->b_rptr;
#ifdef DEBUG
	/* Cross-check the header length that IP recorded in ira. */
	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
	else
		iph_hdr_length = IPV6_HDR_LEN;
	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
#endif
	/* Skip past the outer IP and ICMP headers */
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];

	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
	/*
	 * Locate the next-header field of the inner packet; nexthdrp is
	 * needed by the ICMP6_PARAM_PROB case below.
	 */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
		freemsg(mp);
		return;
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		switch (icmp6->icmp6_code) {
		case ICMP6_DST_UNREACH_NOPORT:
			error = ECONNREFUSED;
			break;
		case ICMP6_DST_UNREACH_ADMIN:
		case ICMP6_DST_UNREACH_NOROUTE:
		case ICMP6_DST_UNREACH_BEYONDSCOPE:
		case ICMP6_DST_UNREACH_ADDR:
			/* Transient errors */
			break;
		default:
			break;
		}
		break;
	case ICMP6_PACKET_TOO_BIG: {
		struct T_unitdata_ind	*tudi;
		struct T_opthdr		*toh;
		size_t			udi_size;
		mblk_t			*newmp;
		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
		    sizeof (struct ip6_mtuinfo);
		/* NOTE(review): shadows the function-scope sin6_t sin6. */
		sin6_t			*sin6;
		struct ip6_mtuinfo	*mtuinfo;

		/*
		 * If the application has requested to receive path mtu
		 * information, send up an empty message containing an
		 * IPV6_PATHMTU ancillary data item.
		 */
		if (!connp->conn_ipv6_recvpathmtu)
			break;

		/* Layout: T_unitdata_ind | sin6_t | T_opthdr | ip6_mtuinfo */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
		    opt_length;
		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
			break;
		}

		/*
		 * newmp->b_cont is left to NULL on purpose.  This is an
		 * empty message containing only ancillary data.
		 */
		newmp->b_datap->db_type = M_PROTO;
		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
		newmp->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin6_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
		tudi->OPT_length = opt_length;

		/* Source address: the conn's connected (foreign) address. */
		sin6 = (sin6_t *)&tudi[1];
		bzero(sin6, sizeof (sin6_t));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = connp->conn_faddr_v6;

		toh = (struct T_opthdr *)&sin6[1];
		toh->level = IPPROTO_IPV6;
		toh->name = IPV6_PATHMTU;
		toh->len = opt_length;
		toh->status = 0;

		/* Destination of the inner packet plus the MTU from the ICMP. */
		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
		/*
		 * We've consumed everything we need from the original
		 * message.  Free it, then send our empty message.
		 */
		freemsg(mp);
		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
		return;
	}
	case ICMP6_TIME_EXCEEDED:
		/* Transient errors */
		break;
	case ICMP6_PARAM_PROB:
		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
		    (uchar_t *)nexthdrp) {
			error = ECONNREFUSED;
			break;
		}
		break;
	}
	/* Nothing permanent to report (includes the skipped too-big cases). */
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The error is reported against the inner packet's destination. */
	sin6 = sin6_null;
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = ip6h->ip6_dst;
	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
	if (IPCL_IS_NONSTR(connp)) {
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			/*
			 * Connected: only an error for the connected peer
			 * is passed up; the upcall runs without conn_lock.
			 */
			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
			    &connp->conn_faddr_v6)) {
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			/* Not connected: remember the error and its address. */
			icmp->icmp_delayed_error = error;
			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
		    NULL, 0, error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}

/*
 * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
 * The local address is filled in if endpoint is bound. The remote address
 * is filled in if remote address has been specified ("connected endpoint")
 * (The concept of connected CLTS sockets is alien to published TPI
 * but we support it anyway).
1448 */ 1449 static void 1450 icmp_addr_req(queue_t *q, mblk_t *mp) 1451 { 1452 struct sockaddr *sa; 1453 mblk_t *ackmp; 1454 struct T_addr_ack *taa; 1455 icmp_t *icmp = Q_TO_ICMP(q); 1456 conn_t *connp = icmp->icmp_connp; 1457 uint_t addrlen; 1458 1459 /* Make it large enough for worst case */ 1460 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1461 2 * sizeof (sin6_t), 1); 1462 if (ackmp == NULL) { 1463 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1464 return; 1465 } 1466 taa = (struct T_addr_ack *)ackmp->b_rptr; 1467 1468 bzero(taa, sizeof (struct T_addr_ack)); 1469 ackmp->b_wptr = (uchar_t *)&taa[1]; 1470 1471 taa->PRIM_type = T_ADDR_ACK; 1472 ackmp->b_datap->db_type = M_PCPROTO; 1473 1474 if (connp->conn_family == AF_INET) 1475 addrlen = sizeof (sin_t); 1476 else 1477 addrlen = sizeof (sin6_t); 1478 1479 mutex_enter(&connp->conn_lock); 1480 /* 1481 * Note: Following code assumes 32 bit alignment of basic 1482 * data structures like sin_t and struct T_addr_ack. 1483 */ 1484 if (icmp->icmp_state != TS_UNBND) { 1485 /* 1486 * Fill in local address first 1487 */ 1488 taa->LOCADDR_offset = sizeof (*taa); 1489 taa->LOCADDR_length = addrlen; 1490 sa = (struct sockaddr *)&taa[1]; 1491 (void) conn_getsockname(connp, sa, &addrlen); 1492 ackmp->b_wptr += addrlen; 1493 } 1494 if (icmp->icmp_state == TS_DATA_XFER) { 1495 /* 1496 * connected, fill remote address too 1497 */ 1498 taa->REMADDR_length = addrlen; 1499 /* assumed 32-bit alignment */ 1500 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1501 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1502 (void) conn_getpeername(connp, sa, &addrlen); 1503 ackmp->b_wptr += addrlen; 1504 } 1505 mutex_exit(&connp->conn_lock); 1506 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1507 qreply(q, ackmp); 1508 } 1509 1510 static void 1511 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1512 { 1513 conn_t *connp = icmp->icmp_connp; 1514 1515 *tap = icmp_g_t_info_ack; 1516 1517 if (connp->conn_family 
== AF_INET6) 1518 tap->ADDR_size = sizeof (sin6_t); 1519 else 1520 tap->ADDR_size = sizeof (sin_t); 1521 tap->CURRENT_state = icmp->icmp_state; 1522 tap->OPT_size = icmp_max_optsize; 1523 } 1524 1525 static void 1526 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1527 t_uscalar_t cap_bits1) 1528 { 1529 tcap->CAP_bits1 = 0; 1530 1531 if (cap_bits1 & TC1_INFO) { 1532 icmp_copy_info(&tcap->INFO_ack, icmp); 1533 tcap->CAP_bits1 |= TC1_INFO; 1534 } 1535 } 1536 1537 /* 1538 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1539 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1540 * icmp_g_t_info_ack. The current state of the stream is copied from 1541 * icmp_state. 1542 */ 1543 static void 1544 icmp_capability_req(queue_t *q, mblk_t *mp) 1545 { 1546 icmp_t *icmp = Q_TO_ICMP(q); 1547 t_uscalar_t cap_bits1; 1548 struct T_capability_ack *tcap; 1549 1550 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1551 1552 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1553 mp->b_datap->db_type, T_CAPABILITY_ACK); 1554 if (!mp) 1555 return; 1556 1557 tcap = (struct T_capability_ack *)mp->b_rptr; 1558 1559 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1560 1561 qreply(q, mp); 1562 } 1563 1564 /* 1565 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1566 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1567 * The current state of the stream is copied from icmp_state. 1568 */ 1569 static void 1570 icmp_info_req(queue_t *q, mblk_t *mp) 1571 { 1572 icmp_t *icmp = Q_TO_ICMP(q); 1573 1574 /* Create a T_INFO_ACK message. 
*/ 1575 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1576 T_INFO_ACK); 1577 if (!mp) 1578 return; 1579 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1580 qreply(q, mp); 1581 } 1582 1583 static int 1584 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1585 int family) 1586 { 1587 conn_t *connp; 1588 dev_t conn_dev; 1589 int error; 1590 1591 /* If the stream is already open, return immediately. */ 1592 if (q->q_ptr != NULL) 1593 return (0); 1594 1595 if (sflag == MODOPEN) 1596 return (EINVAL); 1597 1598 /* 1599 * Since ICMP is not used so heavily, allocating from the small 1600 * arena should be sufficient. 1601 */ 1602 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1603 return (EBUSY); 1604 } 1605 1606 if (flag & SO_FALLBACK) { 1607 /* 1608 * Non streams socket needs a stream to fallback to 1609 */ 1610 RD(q)->q_ptr = (void *)conn_dev; 1611 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1612 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1613 qprocson(q); 1614 return (0); 1615 } 1616 1617 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1618 if (connp == NULL) { 1619 ASSERT(error != 0); 1620 inet_minor_free(ip_minor_arena_sa, conn_dev); 1621 return (error); 1622 } 1623 1624 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1625 connp->conn_dev = conn_dev; 1626 connp->conn_minor_arena = ip_minor_arena_sa; 1627 1628 /* 1629 * Initialize the icmp_t structure for this stream. 1630 */ 1631 q->q_ptr = connp; 1632 WR(q)->q_ptr = connp; 1633 connp->conn_rq = q; 1634 connp->conn_wq = WR(q); 1635 1636 WR(q)->q_hiwat = connp->conn_sndbuf; 1637 WR(q)->q_lowat = connp->conn_sndlowat; 1638 1639 qprocson(q); 1640 1641 /* Set the Stream head write offset. 
*/ 1642 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1643 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1644 1645 mutex_enter(&connp->conn_lock); 1646 connp->conn_state_flags &= ~CONN_INCIPIENT; 1647 mutex_exit(&connp->conn_lock); 1648 1649 icmp_bind_proto(connp->conn_icmp); 1650 1651 return (0); 1652 } 1653 1654 /* For /dev/icmp aka AF_INET open */ 1655 static int 1656 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1657 { 1658 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1659 } 1660 1661 /* For /dev/icmp6 aka AF_INET6 open */ 1662 static int 1663 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1664 { 1665 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1666 } 1667 1668 /* 1669 * This is the open routine for icmp. It allocates a icmp_t structure for 1670 * the stream and, on the first open of the module, creates an ND table. 1671 */ 1672 static conn_t * 1673 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1674 { 1675 icmp_t *icmp; 1676 conn_t *connp; 1677 zoneid_t zoneid; 1678 netstack_t *ns; 1679 icmp_stack_t *is; 1680 int len; 1681 boolean_t isv6 = B_FALSE; 1682 1683 *err = secpolicy_net_icmpaccess(credp); 1684 if (*err != 0) 1685 return (NULL); 1686 1687 if (family == AF_INET6) 1688 isv6 = B_TRUE; 1689 1690 ns = netstack_find_by_cred(credp); 1691 ASSERT(ns != NULL); 1692 is = ns->netstack_icmp; 1693 ASSERT(is != NULL); 1694 1695 /* 1696 * For exclusive stacks we set the zoneid to zero 1697 * to make ICMP operate as if in the global zone. 1698 */ 1699 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1700 zoneid = GLOBAL_ZONEID; 1701 else 1702 zoneid = crgetzoneid(credp); 1703 1704 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1705 1706 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1707 icmp = connp->conn_icmp; 1708 1709 /* 1710 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1711 * done by netstack_find_by_cred() 1712 */ 1713 netstack_rele(ns); 1714 1715 /* 1716 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1717 * need to lock anything. 1718 */ 1719 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1720 ASSERT(connp->conn_icmp == icmp); 1721 ASSERT(icmp->icmp_connp == connp); 1722 1723 /* Set the initial state of the stream and the privilege status. */ 1724 icmp->icmp_state = TS_UNBND; 1725 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1726 if (isv6) { 1727 connp->conn_family = AF_INET6; 1728 connp->conn_ipversion = IPV6_VERSION; 1729 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1730 connp->conn_proto = IPPROTO_ICMPV6; 1731 /* May be changed by a SO_PROTOTYPE socket option. */ 1732 connp->conn_proto = IPPROTO_ICMPV6; 1733 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1734 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1735 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1736 len = sizeof (ip6_t); 1737 } else { 1738 connp->conn_family = AF_INET; 1739 connp->conn_ipversion = IPV4_VERSION; 1740 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1741 /* May be changed by a SO_PROTOTYPE socket option. */ 1742 connp->conn_proto = IPPROTO_ICMP; 1743 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1744 connp->conn_default_ttl = is->is_ipv4_ttl; 1745 len = sizeof (ipha_t); 1746 } 1747 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1748 1749 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1750 1751 /* 1752 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1753 * the checksum is provided in the pre-built packet. We clear 1754 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1755 * complete IP header and not to compute the transport checksum. 
1756 */ 1757 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1758 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1759 connp->conn_ixa->ixa_zoneid = zoneid; 1760 1761 connp->conn_zoneid = zoneid; 1762 1763 /* 1764 * If the caller has the process-wide flag set, then default to MAC 1765 * exempt mode. This allows read-down to unlabeled hosts. 1766 */ 1767 if (getpflags(NET_MAC_AWARE, credp) != 0) 1768 connp->conn_mac_mode = CONN_MAC_AWARE; 1769 1770 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1771 1772 icmp->icmp_is = is; 1773 1774 connp->conn_rcvbuf = is->is_recv_hiwat; 1775 connp->conn_sndbuf = is->is_xmit_hiwat; 1776 connp->conn_sndlowat = is->is_xmit_lowat; 1777 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1778 1779 connp->conn_wroff = len + is->is_wroff_extra; 1780 connp->conn_so_type = SOCK_RAW; 1781 1782 connp->conn_recv = icmp_input; 1783 connp->conn_recvicmp = icmp_icmp_input; 1784 crhold(credp); 1785 connp->conn_cred = credp; 1786 connp->conn_cpid = curproc->p_pid; 1787 connp->conn_open_time = ddi_get_lbolt64(); 1788 /* Cache things in ixa without an extra refhold */ 1789 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1790 connp->conn_ixa->ixa_cred = connp->conn_cred; 1791 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1792 if (is_system_labeled()) 1793 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1794 1795 connp->conn_flow_cntrld = B_FALSE; 1796 1797 if (is->is_pmtu_discovery) 1798 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1799 1800 return (connp); 1801 } 1802 1803 /* 1804 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1805 */ 1806 /* ARGSUSED */ 1807 static boolean_t 1808 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1809 { 1810 return (B_TRUE); 1811 } 1812 1813 /* 1814 * This routine gets default values of certain options whose default 1815 * values are maintained by protcol specific code 1816 */ 1817 int 1818 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1819 { 1820 icmp_t *icmp = Q_TO_ICMP(q); 1821 icmp_stack_t *is = icmp->icmp_is; 1822 int *i1 = (int *)ptr; 1823 1824 switch (level) { 1825 case IPPROTO_IP: 1826 switch (name) { 1827 case IP_MULTICAST_TTL: 1828 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1829 return (sizeof (uchar_t)); 1830 case IP_MULTICAST_LOOP: 1831 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1832 return (sizeof (uchar_t)); 1833 } 1834 break; 1835 case IPPROTO_IPV6: 1836 switch (name) { 1837 case IPV6_MULTICAST_HOPS: 1838 *i1 = IP_DEFAULT_MULTICAST_TTL; 1839 return (sizeof (int)); 1840 case IPV6_MULTICAST_LOOP: 1841 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1842 return (sizeof (int)); 1843 case IPV6_UNICAST_HOPS: 1844 *i1 = is->is_ipv6_hoplimit; 1845 return (sizeof (int)); 1846 } 1847 break; 1848 case IPPROTO_ICMPV6: 1849 switch (name) { 1850 case ICMP6_FILTER: 1851 /* Make it look like "pass all" */ 1852 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1853 return (sizeof (icmp6_filter_t)); 1854 } 1855 break; 1856 } 1857 return (-1); 1858 } 1859 1860 /* 1861 * This routine retrieves the current status of socket options. 1862 * It returns the size of the option retrieved, or -1. 
 *
 * The ICMP specific options (IP_HDRINCL, IPV6_CHECKSUM, ICMP6_FILTER) are
 * answered here; IP_OPTIONS/T_IP_OPTIONS report a zero length value.
 * Anything else is passed to conn_opt_get() with conn_lock held.
 */
int
icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		*i1 = (int *)ptr;
	conn_opt_arg_t	coas;
	int		retval;

	/* Argument block for the common routine; nothing is modified. */
	coas.coa_connp = connp;
	coas.coa_ixa = connp->conn_ixa;
	coas.coa_ipp = &connp->conn_xmit_ipp;
	coas.coa_ancillary = B_FALSE;
	coas.coa_changed = 0;

	/*
	 * We assume that the optcom framework has checked for the set
	 * of levels and names that are supported, hence we don't worry
	 * about rejecting based on that.
	 * First check for ICMP specific handling, then pass to common routine.
	 */
	switch (level) {
	case IPPROTO_IP:
		/*
		 * Only allow IPv4 option processing on IPv4 sockets.
		 */
		if (connp->conn_family != AF_INET)
			return (-1);

		switch (name) {
		case IP_OPTIONS:
		case T_IP_OPTIONS:
			/* Options are passed up with each packet */
			return (0);
		case IP_HDRINCL:
			mutex_enter(&connp->conn_lock);
			*i1 = (int)icmp->icmp_hdrincl;
			mutex_exit(&connp->conn_lock);
			return (sizeof (int));
		}
		break;

	case IPPROTO_IPV6:
		/*
		 * Only allow IPv6 option processing on native IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (-1);

		switch (name) {
		case IPV6_CHECKSUM:
			/*
			 * Return offset or -1 if no checksum offset.
			 * Does not apply to IPPROTO_ICMPV6
			 */
			if (connp->conn_proto == IPPROTO_ICMPV6)
				return (-1);

			mutex_enter(&connp->conn_lock);
			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
			else
				*i1 = -1;
			mutex_exit(&connp->conn_lock);
			return (sizeof (int));
		}
		break;

	case IPPROTO_ICMPV6:
		/*
		 * Only allow IPv6 option processing on native IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (-1);

		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (-1);

		switch (name) {
		case ICMP6_FILTER:
			mutex_enter(&connp->conn_lock);
			if (icmp->icmp_filter == NULL) {
				/* Make it look like "pass all" */
				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
			} else {
				(void) bcopy(icmp->icmp_filter, ptr,
				    sizeof (icmp6_filter_t));
			}
			mutex_exit(&connp->conn_lock);
			return (sizeof (icmp6_filter_t));
		}
		/* Last case: other names fall out to the common routine. */
	}
	mutex_enter(&connp->conn_lock);
	retval = conn_opt_get(&coas, level, name, ptr);
	mutex_exit(&connp->conn_lock);
	return (retval);
}

/*
 * This routine retrieves the current status of socket options.
 * It returns the size of the option retrieved, or -1.
 *
 * Queue based wrapper around icmp_opt_get().
 */
int
icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
	conn_t	*connp = Q_TO_CONN(q);
	int	err;

	err = icmp_opt_get(connp, level, name, ptr);
	return (err);
}

/*
 * This routine sets socket options.
 *
 * The ICMP specific part of each option is validated and applied here
 * (skipped when checkonly is set); unless a case returns early the request
 * is then handed to conn_opt_set().  Changes that need follow-up work by
 * the caller are flagged in coa->coa_changed.  Returns 0 or an errno.
 * Must be called without conn_lock held.
 */
int
icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
    uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
{
	conn_t		*connp = coa->coa_connp;
	ip_xmit_attr_t	*ixa = coa->coa_ixa;
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		*i1 = (int *)invalp;
	/*
	 * NOTE(review): invalp is dereferenced here for every option, even
	 * ones that permit inlen == 0 (ICMP6_FILTER) -- confirm callers
	 * always pass a readable buffer.
	 */
	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
	int		error;

	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));

	/*
	 * For fixed length options, no sanity check
	 * of passed in length is done. It is assumed *_optcom_req()
	 * routines do the right thing.
	 */

	switch (level) {
	case SOL_SOCKET:
		switch (name) {
		case SO_PROTOTYPE:
			/* Protocols other than ICMP/ICMPv6 need privilege. */
			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
			    secpolicy_net_rawaccess(cr) != 0) {
				return (EACCES);
			}
			if (checkonly)
				break;

			mutex_enter(&connp->conn_lock);
			connp->conn_proto = *i1 & 0xFF;
			ixa->ixa_protocol = connp->conn_proto;
			if ((connp->conn_proto == IPPROTO_RAW ||
			    connp->conn_proto == IPPROTO_IGMP) &&
			    connp->conn_family == AF_INET) {
				icmp->icmp_hdrincl = 1;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else if (connp->conn_proto == IPPROTO_UDP ||
			    connp->conn_proto == IPPROTO_TCP ||
			    connp->conn_proto == IPPROTO_SCTP) {
				/* Used by test applications like psh */
				icmp->icmp_hdrincl = 0;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else {
				icmp->icmp_hdrincl = 0;
				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
			}

			if (connp->conn_family == AF_INET6 &&
			    connp->conn_proto == IPPROTO_ICMPV6) {
				/* Set offset for icmp6_cksum */
				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = 2;
			}
			/* An ICMPv6 filter is kept only for ICMPv6. */
			if (icmp->icmp_filter != NULL &&
			    connp->conn_proto != IPPROTO_ICMPV6) {
				kmem_free(icmp->icmp_filter,
				    sizeof (icmp6_filter_t));
				icmp->icmp_filter = NULL;
			}
			mutex_exit(&connp->conn_lock);

			coa->coa_changed |= COA_HEADER_CHANGED;
			/*
			 * For SCTP, we don't use icmp_bind_proto() for
			 * raw socket binding.
			 */
			if (connp->conn_proto == IPPROTO_SCTP)
				return (0);

			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
			/* Handled entirely here; conn_opt_set() is skipped. */
			return (0);

		case SO_SNDBUF:
			if (*i1 > is->is_max_buf) {
				return (ENOBUFS);
			}
			break;
		case SO_RCVBUF:
			if (*i1 > is->is_max_buf) {
				return (ENOBUFS);
			}
			break;
		}
		break;

	case IPPROTO_IP:
		/*
		 * Only allow IPv4 option processing on IPv4 sockets.
		 */
		if (connp->conn_family != AF_INET)
			return (EINVAL);

		switch (name) {
		case IP_HDRINCL:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				icmp->icmp_hdrincl = onoff;
				/* ULP checksum is set only without HDRINCL. */
				if (onoff)
					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
				else
					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
				mutex_exit(&connp->conn_lock);
			}
			break;
		}
		break;

	case IPPROTO_IPV6:
		if (connp->conn_family != AF_INET6)
			return (EINVAL);

		switch (name) {
		case IPV6_CHECKSUM:
			/*
			 * Integer offset into the user data of where the
			 * checksum is located.
			 * Offset of -1 disables option.
			 * Does not apply to IPPROTO_ICMPV6.
			 */
			if (connp->conn_proto == IPPROTO_ICMPV6 ||
			    coa->coa_ancillary) {
				return (EINVAL);
			}
			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
				/* Negative or not 16 bit aligned offset */
				return (EINVAL);
			}
			if (checkonly)
				break;

			mutex_enter(&connp->conn_lock);
			if (*i1 == -1) {
				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = 0;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else {
				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = *i1;
				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;

	case IPPROTO_ICMPV6:
		/*
		 * Only allow IPv6 option processing on IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (EINVAL);
		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (EINVAL);

		switch (name) {
		case ICMP6_FILTER:
			/*
			 * NOTE(review): the length check below is skipped
			 * for checkonly requests.
			 */
			if (checkonly)
				break;

			if ((inlen != 0) &&
			    (inlen != sizeof (icmp6_filter_t)))
				return (EINVAL);

			mutex_enter(&connp->conn_lock);
			if (inlen == 0) {
				/* Zero length removes any installed filter. */
				if (icmp->icmp_filter != NULL) {
					kmem_free(icmp->icmp_filter,
					    sizeof (icmp6_filter_t));
					icmp->icmp_filter = NULL;
				}
			} else {
				/* Allocated with conn_lock held: no sleeping. */
				if (icmp->icmp_filter == NULL) {
					icmp->icmp_filter = kmem_alloc(
					    sizeof (icmp6_filter_t),
					    KM_NOSLEEP);
					if (icmp->icmp_filter == NULL) {
						mutex_exit(&connp->conn_lock);
						return (ENOBUFS);
					}
				}
				(void) bcopy(invalp, icmp->icmp_filter, inlen);
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;
	}
	error = conn_opt_set(coa, level, name, inlen, invalp,
	    checkonly, cr);
	return (error);
}

/*
 * This routine sets socket options.
 */
int
icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		err;
	conn_opt_arg_t	coas, *coa;
	boolean_t	checkonly;
	icmp_stack_t	*is = icmp->icmp_is;

	switch (optset_context) {
	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 *	we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 *	value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
2198 */ 2199 if (inlen == 0) { 2200 *outlenp = 0; 2201 return (0); 2202 } 2203 break; 2204 case SETFN_OPTCOM_NEGOTIATE: 2205 checkonly = B_FALSE; 2206 break; 2207 case SETFN_UD_NEGOTIATE: 2208 case SETFN_CONN_NEGOTIATE: 2209 checkonly = B_FALSE; 2210 /* 2211 * Negotiating local and "association-related" options 2212 * through T_UNITDATA_REQ. 2213 * 2214 * Following routine can filter out ones we do not 2215 * want to be "set" this way. 2216 */ 2217 if (!icmp_opt_allow_udr_set(level, name)) { 2218 *outlenp = 0; 2219 return (EINVAL); 2220 } 2221 break; 2222 default: 2223 /* 2224 * We should never get here 2225 */ 2226 *outlenp = 0; 2227 return (EINVAL); 2228 } 2229 2230 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 2231 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 2232 2233 if (thisdg_attrs != NULL) { 2234 /* Options from T_UNITDATA_REQ */ 2235 coa = (conn_opt_arg_t *)thisdg_attrs; 2236 ASSERT(coa->coa_connp == connp); 2237 ASSERT(coa->coa_ixa != NULL); 2238 ASSERT(coa->coa_ipp != NULL); 2239 ASSERT(coa->coa_ancillary); 2240 } else { 2241 coa = &coas; 2242 coas.coa_connp = connp; 2243 /* Get a reference on conn_ixa to prevent concurrent mods */ 2244 coas.coa_ixa = conn_get_ixa(connp, B_TRUE); 2245 if (coas.coa_ixa == NULL) { 2246 *outlenp = 0; 2247 return (ENOMEM); 2248 } 2249 coas.coa_ipp = &connp->conn_xmit_ipp; 2250 coas.coa_ancillary = B_FALSE; 2251 coas.coa_changed = 0; 2252 } 2253 2254 err = icmp_do_opt_set(coa, level, name, inlen, invalp, 2255 cr, checkonly); 2256 if (err != 0) { 2257 if (!coa->coa_ancillary) 2258 ixa_refrele(coa->coa_ixa); 2259 *outlenp = 0; 2260 return (err); 2261 } 2262 2263 /* 2264 * Common case of OK return with outval same as inval. 
2265 */ 2266 if (invalp != outvalp) { 2267 /* don't trust bcopy for identical src/dst */ 2268 (void) bcopy(invalp, outvalp, inlen); 2269 } 2270 *outlenp = inlen; 2271 2272 /* 2273 * If this was not ancillary data, then we rebuild the headers, 2274 * update the IRE/NCE, and IPsec as needed. 2275 * Since the label depends on the destination we go through 2276 * ip_set_destination first. 2277 */ 2278 if (coa->coa_ancillary) { 2279 return (0); 2280 } 2281 2282 if (coa->coa_changed & COA_ROUTE_CHANGED) { 2283 in6_addr_t saddr, faddr, nexthop; 2284 in_port_t fport; 2285 2286 /* 2287 * We clear lastdst to make sure we pick up the change 2288 * next time sending. 2289 * If we are connected we re-cache the information. 2290 * We ignore errors to preserve BSD behavior. 2291 * Note that we don't redo IPsec policy lookup here 2292 * since the final destination (or source) didn't change. 2293 */ 2294 mutex_enter(&connp->conn_lock); 2295 connp->conn_v6lastdst = ipv6_all_zeros; 2296 2297 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, 2298 &connp->conn_faddr_v6, &nexthop); 2299 saddr = connp->conn_saddr_v6; 2300 faddr = connp->conn_faddr_v6; 2301 fport = connp->conn_fport; 2302 mutex_exit(&connp->conn_lock); 2303 2304 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && 2305 !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { 2306 (void) ip_attr_connect(connp, coa->coa_ixa, 2307 &saddr, &faddr, &nexthop, fport, NULL, NULL, 2308 IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 2309 } 2310 } 2311 2312 ixa_refrele(coa->coa_ixa); 2313 2314 if (coa->coa_changed & COA_HEADER_CHANGED) { 2315 /* 2316 * Rebuild the header template if we are connected. 2317 * Otherwise clear conn_v6lastdst so we rebuild the header 2318 * in the data path. 
2319 */ 2320 mutex_enter(&connp->conn_lock); 2321 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 2322 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 2323 err = icmp_build_hdr_template(connp, 2324 &connp->conn_saddr_v6, &connp->conn_faddr_v6, 2325 connp->conn_flowinfo); 2326 if (err != 0) { 2327 mutex_exit(&connp->conn_lock); 2328 return (err); 2329 } 2330 } else { 2331 connp->conn_v6lastdst = ipv6_all_zeros; 2332 } 2333 mutex_exit(&connp->conn_lock); 2334 } 2335 if (coa->coa_changed & COA_RCVBUF_CHANGED) { 2336 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 2337 connp->conn_rcvbuf); 2338 } 2339 if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 2340 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 2341 } 2342 if (coa->coa_changed & COA_WROFF_CHANGED) { 2343 /* Increase wroff if needed */ 2344 uint_t wroff; 2345 2346 mutex_enter(&connp->conn_lock); 2347 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; 2348 if (wroff > connp->conn_wroff) { 2349 connp->conn_wroff = wroff; 2350 mutex_exit(&connp->conn_lock); 2351 (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); 2352 } else { 2353 mutex_exit(&connp->conn_lock); 2354 } 2355 } 2356 if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { 2357 icmp_bind_proto(icmp); 2358 } 2359 return (err); 2360 } 2361 2362 /* This routine sets socket options. */ 2363 int 2364 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 2365 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2366 void *thisdg_attrs, cred_t *cr) 2367 { 2368 conn_t *connp = Q_TO_CONN(q); 2369 int error; 2370 2371 error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, 2372 outlenp, outvalp, thisdg_attrs, cr); 2373 return (error); 2374 } 2375 2376 /* 2377 * Setup IP headers. 2378 * 2379 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, 2380 * but icmp_output_hdrincl restores ipha_protocol once we return. 
2381 */ 2382 mblk_t * 2383 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2384 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2385 mblk_t *data_mp, int *errorp) 2386 { 2387 mblk_t *mp; 2388 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2389 uint_t data_len; 2390 uint32_t cksum; 2391 2392 data_len = msgdsize(data_mp); 2393 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2394 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2395 if (mp == NULL) { 2396 ASSERT(*errorp != 0); 2397 return (NULL); 2398 } 2399 2400 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2401 2402 /* 2403 * If there was a routing option/header then conn_prepend_hdr 2404 * has massaged it and placed the pseudo-header checksum difference 2405 * in the cksum argument. 2406 * 2407 * Prepare for ICMPv6 checksum done in IP. 2408 * 2409 * We make it easy for IP to include our pseudo header 2410 * by putting our length (and any routing header adjustment) 2411 * in the ICMPv6 checksum field. 2412 * The IP source, destination, and length have already been set by 2413 * conn_prepend_hdr. 
2414 */ 2415 cksum += data_len; 2416 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2417 ASSERT(cksum < 0x10000); 2418 2419 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2420 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2421 2422 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2423 } else { 2424 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2425 uint_t cksum_offset = 0; 2426 2427 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2428 2429 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2430 if (connp->conn_proto == IPPROTO_ICMPV6) { 2431 cksum_offset = ixa->ixa_ip_hdr_length + 2432 offsetof(icmp6_t, icmp6_cksum); 2433 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2434 cksum_offset = ixa->ixa_ip_hdr_length + 2435 ixa->ixa_raw_cksum_offset; 2436 } 2437 } 2438 if (cksum_offset != 0) { 2439 uint16_t *ptr; 2440 2441 /* Make sure the checksum fits in the first mblk */ 2442 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2443 mblk_t *mp1; 2444 2445 mp1 = msgpullup(mp, 2446 cksum_offset + sizeof (short)); 2447 freemsg(mp); 2448 if (mp1 == NULL) { 2449 *errorp = ENOMEM; 2450 return (NULL); 2451 } 2452 mp = mp1; 2453 ip6h = (ip6_t *)mp->b_rptr; 2454 } 2455 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2456 *ptr = htons(cksum); 2457 } 2458 } 2459 2460 /* Note that we don't try to update wroff due to ancillary data */ 2461 return (mp); 2462 } 2463 2464 static int 2465 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2466 const in6_addr_t *v6dst, uint32_t flowinfo) 2467 { 2468 int error; 2469 2470 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2471 /* 2472 * We clear lastdst to make sure we don't use the lastdst path 2473 * next time sending since we might not have set v6dst yet. 2474 */ 2475 connp->conn_v6lastdst = ipv6_all_zeros; 2476 2477 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2478 if (error != 0) 2479 return (error); 2480 2481 /* 2482 * Any routing header/option has been massaged. The checksum difference 2483 * is stored in conn_sum. 
2484 */ 2485 return (0); 2486 } 2487 2488 static mblk_t * 2489 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2490 { 2491 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2492 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2493 /* 2494 * fallback has started but messages have not been moved yet 2495 */ 2496 if (icmp->icmp_fallback_queue_head == NULL) { 2497 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2498 icmp->icmp_fallback_queue_head = mp; 2499 icmp->icmp_fallback_queue_tail = mp; 2500 } else { 2501 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2502 icmp->icmp_fallback_queue_tail->b_next = mp; 2503 icmp->icmp_fallback_queue_tail = mp; 2504 } 2505 return (NULL); 2506 } else { 2507 /* 2508 * Fallback completed, let the caller putnext() the mblk. 2509 */ 2510 return (mp); 2511 } 2512 } 2513 2514 /* 2515 * Deliver data to ULP. In case we have a socket, and it's falling back to 2516 * TPI, then we'll queue the mp for later processing. 2517 */ 2518 static void 2519 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2520 { 2521 if (IPCL_IS_NONSTR(connp)) { 2522 icmp_t *icmp = connp->conn_icmp; 2523 int error; 2524 2525 ASSERT(len == msgdsize(mp)); 2526 if ((*connp->conn_upcalls->su_recv) 2527 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2528 mutex_enter(&icmp->icmp_recv_lock); 2529 if (error == ENOSPC) { 2530 /* 2531 * let's confirm while holding the lock 2532 */ 2533 if ((*connp->conn_upcalls->su_recv) 2534 (connp->conn_upper_handle, NULL, 0, 0, 2535 &error, NULL) < 0) { 2536 ASSERT(error == ENOSPC); 2537 if (error == ENOSPC) { 2538 connp->conn_flow_cntrld = 2539 B_TRUE; 2540 } 2541 } 2542 mutex_exit(&icmp->icmp_recv_lock); 2543 } else { 2544 ASSERT(error == EOPNOTSUPP); 2545 mp = icmp_queue_fallback(icmp, mp); 2546 mutex_exit(&icmp->icmp_recv_lock); 2547 if (mp != NULL) 2548 putnext(connp->conn_rq, mp); 2549 } 2550 } 2551 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2552 } else { 2553 putnext(connp->conn_rq, mp); 2554 } 2555 } 2556 2557 /* 2558 * This is 
 * the inbound data path.
 * IP has already pulled up the IP headers and verified alignment
 * etc.
 *
 * arg1 is the conn_t the packet was fanned out to, mp is the M_DATA packet
 * starting at the IP header, arg2 is unused and ira describes the received
 * packet (flags, IP header length, total length).
 *
 * A T_UNITDATA_IND carrying the source address (sin_t for IPv4, sin6_t for
 * IPv6) and any requested ancillary data is allocated and linked in front
 * of the data before it is handed to icmp_ulp_recv(). For IPv4 the IP
 * header is left on the data; for IPv6 b_rptr is advanced past the IP
 * headers. mp is always consumed: it is freed when the packet is dropped
 * (allocation failure, ICMPv6 filter match, raw checksum failure).
 */
/* ARGSUSED2 */
static void
icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t *connp = (conn_t *)arg1;
	struct T_unitdata_ind	*tudi;
	uchar_t			*rptr;		/* Pointer to IP header */
	int			ip_hdr_length;
	int			udi_size;	/* Size of T_unitdata_ind */
	int			pkt_len;
	icmp_t			*icmp;
	ip_pkt_t		ipps;
	ip6_t			*ip6h;
	mblk_t			*mp1;
	crb_t			recv_ancillary;
	icmp_stack_t		*is;
	sin_t			*sin;
	sin6_t			*sin6;
	ipha_t			*ipha;

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);

	icmp = connp->conn_icmp;
	is = icmp->icmp_is;
	rptr = mp->b_rptr;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(OK_32PTR(rptr));
	ASSERT(ira->ira_pktlen == msgdsize(mp));
	pkt_len = ira->ira_pktlen;

	/*
	 * Get a snapshot of these and allow other threads to change
	 * them after that. We need the same recv_ancillary when determining
	 * the size as when adding the ancillary data items.
	 */
	mutex_enter(&connp->conn_lock);
	recv_ancillary = connp->conn_recv_ancillary;
	mutex_exit(&connp->conn_lock);

	ip_hdr_length = ira->ira_ip_hdr_length;
	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */

	/* Initialize regardless of IP version */
	ipps.ipp_fields = 0;

	if (ira->ira_flags & IRAF_IS_IPV4) {
		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));

		ipha = (ipha_t *)mp->b_rptr;
		if (recv_ancillary.crb_all != 0)
			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);

		/*
		 * BSD for some reason adjusts ipha_length to exclude the
		 * IP header length. We do the same.
		 */
		if (is->is_bsd_compat) {
			ushort_t len;

			len = ntohs(ipha->ipha_length);
			if (mp->b_datap->db_ref > 1) {
				/*
				 * Allocate a new IP header so that we can
				 * modify ipha_length.
				 */
				mblk_t	*mp1;

				mp1 = allocb(ip_hdr_length, BPRI_MED);
				if (mp1 == NULL) {
					freemsg(mp);
					BUMP_MIB(&is->is_rawip_mib,
					    rawipInErrors);
					return;
				}
				/*
				 * Copy the header into the private mblk,
				 * trim it off the shared one, and chain the
				 * shared data behind the copy.
				 */
				bcopy(rptr, mp1->b_rptr, ip_hdr_length);
				mp->b_rptr = rptr + ip_hdr_length;
				rptr = mp1->b_rptr;
				ipha = (ipha_t *)rptr;
				mp1->b_cont = mp;
				mp1->b_wptr = rptr + ip_hdr_length;
				mp = mp1;
			}
			len -= ip_hdr_length;
			ipha->ipha_length = htons(len);
		}

		/*
		 * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
		 * sockets. This is ensured by icmp_bind and the IP fanout code.
		 */
		ASSERT(connp->conn_family == AF_INET);

		/*
		 * This is the inbound data path. Packets are passed upstream
		 * as T_UNITDATA_IND messages with full IPv4 headers still
		 * attached.
		 */

		/*
		 * Normally only send up the source address.
		 * If any ancillary data items are wanted we add those.
		 */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
		if (recv_ancillary.crb_all != 0) {
			udi_size += conn_recvancillary_size(connp,
			    recv_ancillary, ira, mp, &ipps);
		}

		/* Allocate a message block for the T_UNITDATA_IND structure. */
		mp1 = allocb(udi_size, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
			return;
		}
		mp1->b_cont = mp;
		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
		mp1->b_datap->db_type = M_PROTO;
		mp1->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		/* The source address sits right after the fixed header. */
		sin = (sin_t *)&tudi[1];
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipha->ipha_src;
		*(uint32_t *)&sin->sin_zero[0] = 0;
		*(uint32_t *)&sin->sin_zero[4] = 0;
		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
		    sizeof (sin_t);
		/* From here on udi_size is just the length of the options. */
		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
		tudi->OPT_length = udi_size;

		/*
		 * Add options if IP_RECVIF etc is set
		 */
		if (udi_size != 0) {
			conn_recvancillary_add(connp, recv_ancillary, ira,
			    &ipps, (uchar_t *)&sin[1], udi_size);
		}
		goto deliver;
	}

	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
	/*
	 * IPv6 packets can only be received by applications
	 * that are prepared to receive IPv6 addresses.
	 * The IP fanout must ensure this.
	 */
	ASSERT(connp->conn_family == AF_INET6);

	/*
	 * Handle IPv6 packets. We don't pass up the IP headers with the
	 * payload for IPv6.
	 */

	ip6h = (ip6_t *)rptr;
	if (recv_ancillary.crb_all != 0) {
		/*
		 * Call on ip_find_hdr_v6 which gets individual lengths of
		 * extension headers (and pointers to them).
		 */
		uint8_t		nexthdr;

		/* We don't care about the length or nextheader. */
		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);

		/*
		 * We do not pass up hop-by-hop options or any other
		 * extension header as part of the packet. Applications
		 * that want to see them have to specify IPV6_RECV* socket
		 * options. And conn_recvancillary_size/add explicitly
		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
		 *
		 * If we had multilevel ICMP sockets, then we'd want to
		 * modify conn_recvancillary_size/add to
		 * allow the user to see the label.
		 */
	}

	/*
	 * Check a filter for ICMPv6 types if needed.
	 * Verify raw checksums if needed.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_filter != NULL) {
		int type;

		/* Assumes that IP has done the pullupmsg */
		/*
		 * NOTE(review): the byte is read before the ASSERT below, and
		 * the ASSERT allows b_rptr + ip_hdr_length == b_wptr, which
		 * would make this a read one past the data. Presumably IP
		 * guarantees the ICMPv6 header is present - confirm.
		 */
		type = mp->b_rptr[ip_hdr_length];

		ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			return;
		}
	}
	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
		/* Checksum */
		uint16_t	*up;
		uint32_t	sum;
		int		remlen;

		/* up[0..15] covers ip6_src followed by ip6_dst. */
		up = (uint16_t *)&ip6h->ip6_src;

		remlen = msgdsize(mp) - ip_hdr_length;
		sum = htons(connp->conn_proto + remlen)
		    + up[0] + up[1] + up[2] + up[3]
		    + up[4] + up[5] + up[6] + up[7]
		    + up[8] + up[9] + up[10] + up[11]
		    + up[12] + up[13] + up[14] + up[15];
		sum = (sum & 0xffff) + (sum >> 16);
		sum = IP_CSUM(mp, ip_hdr_length, sum);
		if (sum != 0) {
			/* IPv6 RAW checksum failed */
			/* (the debug text below still names icmp_rput) */
			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);

	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);

	if (recv_ancillary.crb_all != 0) {
		udi_size += conn_recvancillary_size(connp,
		    recv_ancillary, ira, mp, &ipps);
	}

	mp1 = allocb(udi_size, BPRI_MED);
	if (mp1 == NULL) {
		freemsg(mp);
		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
		return;
	}
	mp1->b_cont = mp;
	mp1->b_datap->db_type = M_PROTO;
	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
	mp1->b_wptr = (uchar_t *)tudi + udi_size;
	tudi->PRIM_type = T_UNITDATA_IND;
	tudi->SRC_length = sizeof (sin6_t);
	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
	/* From here on udi_size is just the length of the options. */
	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
	tudi->OPT_length = udi_size;
	sin6 = (sin6_t *)&tudi[1];
	*sin6 = sin6_null;
	sin6->sin6_port = 0;
	sin6->sin6_family = AF_INET6;

	sin6->sin6_addr = ip6h->ip6_src;
	/* No sin6_flowinfo per API */
	sin6->sin6_flowinfo = 0;
	/* For link-scope pass up scope id */
	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
		sin6->sin6_scope_id = ira->ira_ruifindex;
	else
		sin6->sin6_scope_id = 0;
	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
	    IPCL_ZONEID(connp), is->is_netstack);

	if (udi_size != 0) {
		conn_recvancillary_add(connp, recv_ancillary, ira,
		    &ipps, (uchar_t *)&sin6[1], udi_size);
	}

	/* Skip all the IPv6 headers per API */
	mp->b_rptr += ip_hdr_length;
	pkt_len -= ip_hdr_length;

deliver:
	/* mp1 heads the message: T_UNITDATA_IND followed by the data. */
	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
	icmp_ulp_recv(connp, mp1, pkt_len);
}

/*
 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
 * information that can be changing beneath us.
2846 */ 2847 mblk_t * 2848 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2849 { 2850 mblk_t *mpdata; 2851 struct opthdr *optp; 2852 conn_t *connp = Q_TO_CONN(q); 2853 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2854 mblk_t *mp2ctl; 2855 2856 /* 2857 * make a copy of the original message 2858 */ 2859 mp2ctl = copymsg(mpctl); 2860 2861 if (mpctl == NULL || 2862 (mpdata = mpctl->b_cont) == NULL) { 2863 freemsg(mpctl); 2864 freemsg(mp2ctl); 2865 return (0); 2866 } 2867 2868 /* fixed length structure for IPv4 and IPv6 counters */ 2869 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2870 optp->level = EXPER_RAWIP; 2871 optp->name = 0; 2872 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2873 sizeof (is->is_rawip_mib)); 2874 optp->len = msgdsize(mpdata); 2875 qreply(q, mpctl); 2876 2877 return (mp2ctl); 2878 } 2879 2880 /* 2881 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2882 * TODO: If this ever actually tries to set anything, it needs to be 2883 * to do the appropriate locking. 2884 */ 2885 /* ARGSUSED */ 2886 int 2887 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2888 uchar_t *ptr, int len) 2889 { 2890 switch (level) { 2891 case EXPER_RAWIP: 2892 return (0); 2893 default: 2894 return (1); 2895 } 2896 } 2897 2898 /* 2899 * This routine creates a T_UDERROR_IND message and passes it upstream. 2900 * The address and options are copied from the T_UNITDATA_REQ message 2901 * passed in mp. This message is freed. 
2902 */ 2903 static void 2904 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2905 { 2906 struct T_unitdata_req *tudr; 2907 mblk_t *mp1; 2908 uchar_t *destaddr; 2909 t_scalar_t destlen; 2910 uchar_t *optaddr; 2911 t_scalar_t optlen; 2912 2913 if ((mp->b_wptr < mp->b_rptr) || 2914 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2915 goto done; 2916 } 2917 tudr = (struct T_unitdata_req *)mp->b_rptr; 2918 destaddr = mp->b_rptr + tudr->DEST_offset; 2919 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2920 destaddr + tudr->DEST_length < mp->b_rptr || 2921 destaddr + tudr->DEST_length > mp->b_wptr) { 2922 goto done; 2923 } 2924 optaddr = mp->b_rptr + tudr->OPT_offset; 2925 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2926 optaddr + tudr->OPT_length < mp->b_rptr || 2927 optaddr + tudr->OPT_length > mp->b_wptr) { 2928 goto done; 2929 } 2930 destlen = tudr->DEST_length; 2931 optlen = tudr->OPT_length; 2932 2933 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2934 (char *)optaddr, optlen, err); 2935 if (mp1 != NULL) 2936 qreply(q, mp1); 2937 2938 done: 2939 freemsg(mp); 2940 } 2941 2942 static int 2943 rawip_do_unbind(conn_t *connp) 2944 { 2945 icmp_t *icmp = connp->conn_icmp; 2946 2947 mutex_enter(&connp->conn_lock); 2948 /* If a bind has not been done, we can't unbind. 
*/ 2949 if (icmp->icmp_state == TS_UNBND) { 2950 mutex_exit(&connp->conn_lock); 2951 return (-TOUTSTATE); 2952 } 2953 connp->conn_saddr_v6 = ipv6_all_zeros; 2954 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2955 connp->conn_laddr_v6 = ipv6_all_zeros; 2956 connp->conn_mcbc_bind = B_FALSE; 2957 connp->conn_lport = 0; 2958 connp->conn_fport = 0; 2959 /* In case we were also connected */ 2960 connp->conn_faddr_v6 = ipv6_all_zeros; 2961 connp->conn_v6lastdst = ipv6_all_zeros; 2962 2963 icmp->icmp_state = TS_UNBND; 2964 2965 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2966 &connp->conn_faddr_v6, connp->conn_flowinfo); 2967 mutex_exit(&connp->conn_lock); 2968 2969 ip_unbind(connp); 2970 return (0); 2971 } 2972 2973 /* 2974 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2975 * After some error checking, the message is passed downstream to ip. 2976 */ 2977 static void 2978 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2979 { 2980 conn_t *connp = Q_TO_CONN(q); 2981 int error; 2982 2983 ASSERT(mp->b_cont == NULL); 2984 error = rawip_do_unbind(connp); 2985 if (error) { 2986 if (error < 0) { 2987 icmp_err_ack(q, mp, -error, 0); 2988 } else { 2989 icmp_err_ack(q, mp, 0, error); 2990 } 2991 return; 2992 } 2993 2994 /* 2995 * Convert mp into a T_OK_ACK 2996 */ 2997 2998 mp = mi_tpi_ok_ack_alloc(mp); 2999 3000 /* 3001 * should not happen in practice... T_OK_ACK is smaller than the 3002 * original message. 3003 */ 3004 ASSERT(mp != NULL); 3005 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 3006 qreply(q, mp); 3007 } 3008 3009 /* 3010 * Process IPv4 packets that already include an IP header. 3011 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 3012 * IPPROTO_IGMP). 3013 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 3014 * 3015 * The packet is assumed to have a base (20 byte) IP header followed 3016 * by the upper-layer protocol. 
We include any IP_OPTIONS including a 3017 * CIPSO label but otherwise preserve the base IP header. 3018 */ 3019 static int 3020 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3021 { 3022 icmp_t *icmp = connp->conn_icmp; 3023 icmp_stack_t *is = icmp->icmp_is; 3024 ipha_t iphas; 3025 ipha_t *ipha; 3026 int ip_hdr_length; 3027 int tp_hdr_len; 3028 ip_xmit_attr_t *ixa; 3029 ip_pkt_t *ipp; 3030 in6_addr_t v6src; 3031 in6_addr_t v6dst; 3032 in6_addr_t v6nexthop; 3033 int error; 3034 boolean_t do_ipsec; 3035 3036 /* 3037 * We need an exclusive copy of conn_ixa since the included IP 3038 * header could have any destination. 3039 * That copy has no pointers hence we 3040 * need to set them up once we've parsed the ancillary data. 3041 */ 3042 ixa = conn_get_ixa_exclusive(connp); 3043 if (ixa == NULL) { 3044 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3045 freemsg(mp); 3046 return (ENOMEM); 3047 } 3048 ASSERT(cr != NULL); 3049 /* 3050 * Caller has a reference on cr; from db_credp or because we 3051 * are running in process context. 
3052 */ 3053 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3054 ixa->ixa_cred = cr; 3055 ixa->ixa_cpid = pid; 3056 if (is_system_labeled()) { 3057 /* We need to restart with a label based on the cred */ 3058 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3059 } 3060 3061 /* In case previous destination was multicast or multirt */ 3062 ip_attr_newdst(ixa); 3063 3064 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3065 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3066 if (ipp == NULL) { 3067 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3068 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3069 ixa->ixa_cpid = connp->conn_cpid; 3070 ixa_refrele(ixa); 3071 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3072 freemsg(mp); 3073 return (ENOMEM); 3074 } 3075 mutex_enter(&connp->conn_lock); 3076 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3077 mutex_exit(&connp->conn_lock); 3078 if (error != 0) { 3079 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3080 freemsg(mp); 3081 goto done; 3082 } 3083 3084 /* Sanity check length of packet */ 3085 ipha = (ipha_t *)mp->b_rptr; 3086 3087 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3088 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3089 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3090 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3091 freemsg(mp); 3092 goto done; 3093 } 3094 ipha = (ipha_t *)mp->b_rptr; 3095 } 3096 ipha->ipha_version_and_hdr_length = 3097 (IP_VERSION<<4) | (ip_hdr_length>>2); 3098 3099 /* 3100 * We set IXAF_DONTFRAG if the application set DF which makes 3101 * IP not fragment. 
3102 */ 3103 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3104 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3105 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3106 else 3107 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3108 3109 /* Even for multicast and broadcast we honor the apps ttl */ 3110 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3111 3112 /* 3113 * No source verification for non-local addresses 3114 */ 3115 if (ipha->ipha_src != INADDR_ANY && 3116 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3117 is->is_netstack->netstack_ip, B_FALSE) 3118 != IPVL_UNICAST_UP) { 3119 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3120 } 3121 3122 if (ipha->ipha_dst == INADDR_ANY) 3123 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3124 3125 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3126 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3127 3128 /* Defer IPsec if it might need to look at ICMP type/code */ 3129 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3130 ixa->ixa_flags |= IXAF_IS_IPV4; 3131 3132 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3133 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3134 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3135 (do_ipsec ? IPDF_IPSEC : 0)); 3136 switch (error) { 3137 case 0: 3138 break; 3139 case EADDRNOTAVAIL: 3140 /* 3141 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3142 * Don't have the application see that errno 3143 */ 3144 error = ENETUNREACH; 3145 goto failed; 3146 case ENETDOWN: 3147 /* 3148 * Have !ipif_addr_ready address; drop packet silently 3149 * until we can get applications to not send until we 3150 * are ready. 3151 */ 3152 error = 0; 3153 goto failed; 3154 case EHOSTUNREACH: 3155 case ENETUNREACH: 3156 if (ixa->ixa_ire != NULL) { 3157 /* 3158 * Let conn_ip_output/ire_send_noroute return 3159 * the error and send any local ICMP error. 
3160 */ 3161 error = 0; 3162 break; 3163 } 3164 /* FALLTHRU */ 3165 default: 3166 failed: 3167 freemsg(mp); 3168 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3169 goto done; 3170 } 3171 if (ipha->ipha_src == INADDR_ANY) 3172 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3173 3174 /* 3175 * We might be going to a different destination than last time, 3176 * thus check that TX allows the communication and compute any 3177 * needed label. 3178 * 3179 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3180 * don't have to worry about concurrent threads. 3181 */ 3182 if (is_system_labeled()) { 3183 /* 3184 * Check whether Trusted Solaris policy allows communication 3185 * with this host, and pretend that the destination is 3186 * unreachable if not. 3187 * Compute any needed label and place it in ipp_label_v4/v6. 3188 * 3189 * Later conn_build_hdr_template/conn_prepend_hdr takes 3190 * ipp_label_v4/v6 to form the packet. 3191 * 3192 * Tsol note: We have ipp structure local to this thread so 3193 * no locking is needed. 3194 */ 3195 error = conn_update_label(connp, ixa, &v6dst, ipp); 3196 if (error != 0) { 3197 freemsg(mp); 3198 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3199 goto done; 3200 } 3201 } 3202 3203 /* 3204 * Save away a copy of the IPv4 header the application passed down 3205 * and then prepend an IPv4 header complete with any IP options 3206 * including label. 3207 * We need a struct copy since icmp_prepend_hdr will reuse the available 3208 * space in the mblk. 
3209 */ 3210 iphas = *ipha; 3211 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3212 3213 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3214 if (mp == NULL) { 3215 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3216 ASSERT(error != 0); 3217 goto done; 3218 } 3219 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3220 error = EMSGSIZE; 3221 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3222 freemsg(mp); 3223 goto done; 3224 } 3225 /* Restore key parts of the header that the application passed down */ 3226 ipha = (ipha_t *)mp->b_rptr; 3227 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3228 ipha->ipha_ident = iphas.ipha_ident; 3229 ipha->ipha_fragment_offset_and_flags = 3230 iphas.ipha_fragment_offset_and_flags; 3231 ipha->ipha_ttl = iphas.ipha_ttl; 3232 ipha->ipha_protocol = iphas.ipha_protocol; 3233 ipha->ipha_src = iphas.ipha_src; 3234 ipha->ipha_dst = iphas.ipha_dst; 3235 3236 ixa->ixa_protocol = ipha->ipha_protocol; 3237 3238 /* 3239 * Make sure that the IP header plus any transport header that is 3240 * checksumed by ip_output is in the first mblk. (ip_output assumes 3241 * that at least the checksum field is in the first mblk.) 
3242 */ 3243 switch (ipha->ipha_protocol) { 3244 case IPPROTO_UDP: 3245 tp_hdr_len = 8; 3246 break; 3247 case IPPROTO_TCP: 3248 tp_hdr_len = 20; 3249 break; 3250 default: 3251 tp_hdr_len = 0; 3252 break; 3253 } 3254 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3255 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3256 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3257 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3258 if (mp->b_cont == NULL) 3259 error = EINVAL; 3260 else 3261 error = ENOMEM; 3262 freemsg(mp); 3263 goto done; 3264 } 3265 } 3266 3267 if (!do_ipsec) { 3268 /* Policy might differ for different ICMP type/code */ 3269 if (ixa->ixa_ipsec_policy != NULL) { 3270 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3271 ixa->ixa_ipsec_policy = NULL; 3272 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3273 } 3274 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3275 if (mp == NULL) { 3276 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3277 error = EHOSTUNREACH; /* IPsec policy failure */ 3278 goto done; 3279 } 3280 } 3281 3282 /* We're done. Pass the packet to ip. */ 3283 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3284 3285 error = conn_ip_output(mp, ixa); 3286 /* No rawipOutErrors if an error since IP increases its error counter */ 3287 switch (error) { 3288 case 0: 3289 break; 3290 case EWOULDBLOCK: 3291 (void) ixa_check_drain_insert(connp, ixa); 3292 error = 0; 3293 break; 3294 case EADDRNOTAVAIL: 3295 /* 3296 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3297 * Don't have the application see that errno 3298 */ 3299 error = ENETUNREACH; 3300 break; 3301 } 3302 done: 3303 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3304 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3305 ixa->ixa_cpid = connp->conn_cpid; 3306 ixa_refrele(ixa); 3307 ip_pkt_free(ipp); 3308 kmem_free(ipp, sizeof (*ipp)); 3309 return (error); 3310 } 3311 /* Drops any IPsec policy already cached on ixa (clearing IXAF_IPSEC_SECURE) and returns the result of ip_output_attach_policy(), passing the packet header as IPv4 or IPv6 according to IXAF_IS_IPV4. Callers in this file treat a NULL return as an IPsec policy failure. */ 3312 static mblk_t * 3313 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3314 { 3315 ipha_t *ipha = NULL; 3316 ip6_t *ip6h = NULL; 3317 3318 if (ixa->ixa_flags & IXAF_IS_IPV4) 3319 ipha = (ipha_t *)mp->b_rptr; 3320 else 3321 ip6h = (ip6_t *)mp->b_rptr; 3322 3323 if (ixa->ixa_ipsec_policy != NULL) { 3324 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3325 ixa->ixa_ipsec_policy = NULL; 3326 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3327 } 3328 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3329 } 3330 3331 /* 3332 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3333 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3334 * the TPI options, otherwise we take them from msg_control. 3335 * If neither sin nor sin6 is set it is a connected socket and we use conn_faddr. 3336 * Always consumes mp; never consumes tudr_mp. Returns 0 or an errno value. 3337 */ 3338 static int 3339 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3340 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3341 { 3342 icmp_t *icmp = connp->conn_icmp; 3343 icmp_stack_t *is = icmp->icmp_is; 3344 int error; 3345 ip_xmit_attr_t *ixa; 3346 ip_pkt_t *ipp; 3347 in6_addr_t v6src; 3348 in6_addr_t v6dst; 3349 in6_addr_t v6nexthop; 3350 in_port_t dstport; 3351 uint32_t flowinfo; 3352 int is_absreq_failure = 0; 3353 conn_opt_arg_t coas, *coa; 3354 3355 ASSERT(tudr_mp != NULL || msg != NULL); 3356 3357 /* 3358 * Get ixa before checking state to handle a disconnect race. 3359 * 3360 * We need an exclusive copy of conn_ixa since the ancillary data 3361 * options might modify it.
That copy has no pointers hence we 3362 * need to set them up once we've parsed the ancillary data. 3363 */ 3364 ixa = conn_get_ixa_exclusive(connp); 3365 if (ixa == NULL) { 3366 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3367 freemsg(mp); 3368 return (ENOMEM); 3369 } 3370 ASSERT(cr != NULL); 3371 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3372 ixa->ixa_cred = cr; 3373 ixa->ixa_cpid = pid; 3374 if (is_system_labeled()) { 3375 /* We need to restart with a label based on the cred */ 3376 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3377 } 3378 3379 /* In case previous destination was multicast or multirt */ 3380 ip_attr_newdst(ixa); 3381 3382 /* Get a copy of conn_xmit_ipp since the options might change it */ 3383 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3384 if (ipp == NULL) { 3385 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3386 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3387 ixa->ixa_cpid = connp->conn_cpid; 3388 ixa_refrele(ixa); 3389 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3390 freemsg(mp); 3391 return (ENOMEM); 3392 } 3393 mutex_enter(&connp->conn_lock); 3394 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3395 mutex_exit(&connp->conn_lock); 3396 if (error != 0) { 3397 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3398 freemsg(mp); 3399 goto done; 3400 } 3401 3402 /* 3403 * Parse the options and update ixa and ipp as a result.
3404 */ 3405 3406 coa = &coas; 3407 coa->coa_connp = connp; 3408 coa->coa_ixa = ixa; 3409 coa->coa_ipp = ipp; 3410 coa->coa_ancillary = B_TRUE; 3411 coa->coa_changed = 0; 3412 3413 if (msg != NULL) { 3414 error = process_auxiliary_options(connp, msg->msg_control, 3415 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3416 } else { 3417 struct T_unitdata_req *tudr; 3418 3419 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3420 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3421 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3422 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3423 coa, &is_absreq_failure); 3424 } 3425 if (error != 0) { 3426 /* 3427 * Note: No special action needed in this 3428 * module for "is_absreq_failure" 3429 */ 3430 freemsg(mp); 3431 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3432 goto done; 3433 } 3434 ASSERT(is_absreq_failure == 0); 3435 3436 mutex_enter(&connp->conn_lock); 3437 /* 3438 * If laddr is unspecified then we look at sin6_src_id. 3439 * We will give precedence to a source address set with IPV6_PKTINFO 3440 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3441 * want ip_attr_connect to select a source (since it can fail) when 3442 * IPV6_PKTINFO is specified. 3443 * If this doesn't result in a source address then we get a source 3444 * from ip_attr_connect() below.
3445 */ 3446 v6src = connp->conn_saddr_v6; 3447 if (sin != NULL) { 3448 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3449 dstport = sin->sin_port; 3450 flowinfo = 0; 3451 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3452 ixa->ixa_flags |= IXAF_IS_IPV4; 3453 } else if (sin6 != NULL) { 3454 boolean_t v4mapped; 3455 uint_t srcid; 3456 3457 v6dst = sin6->sin6_addr; 3458 dstport = sin6->sin6_port; 3459 flowinfo = sin6->sin6_flowinfo; 3460 srcid = sin6->__sin6_src_id; 3461 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3462 ixa->ixa_scopeid = sin6->sin6_scope_id; 3463 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3464 } else { 3465 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3466 } 3467 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 3468 if (v4mapped) 3469 ixa->ixa_flags |= IXAF_IS_IPV4; 3470 else 3471 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3472 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3473 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3474 v4mapped, connp->conn_netstack)) { 3475 /* Mismatched v4mapped/v6 specified by srcid. */ 3476 mutex_exit(&connp->conn_lock); 3477 error = EADDRNOTAVAIL; 3478 goto failed; /* Does freemsg() and mib. */ 3479 } 3480 } 3481 } else { 3482 /* Connected case: neither sin nor sin6 was supplied */ 3483 dstport = connp->conn_fport; 3484 v6dst = connp->conn_faddr_v6; 3485 flowinfo = connp->conn_flowinfo; 3486 } 3487 mutex_exit(&connp->conn_lock); 3488 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address.
*/ 3489 if (ipp->ipp_fields & IPPF_ADDR) { 3490 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3491 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3492 v6src = ipp->ipp_addr; 3493 } else { 3494 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3495 v6src = ipp->ipp_addr; 3496 } 3497 } 3498 /* 3499 * Allow source not assigned to the system 3500 * only if it is not a local address, i.e. clear IXAF_VERIFY_SOURCE unless the address verifies as IPVL_UNICAST_UP 3501 */ 3502 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3503 ip_laddr_t laddr_type; 3504 3505 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3506 ipaddr_t v4src; 3507 3508 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3509 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3510 is->is_netstack->netstack_ip, B_FALSE); 3511 } else { 3512 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3513 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3514 } 3515 if (laddr_type != IPVL_UNICAST_UP) 3516 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3517 } 3518 3519 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3520 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3521 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3522 3523 switch (error) { 3524 case 0: 3525 break; 3526 case EADDRNOTAVAIL: 3527 /* 3528 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3529 * Don't have the application see that errno 3530 */ 3531 error = ENETUNREACH; 3532 goto failed; 3533 case ENETDOWN: 3534 /* 3535 * Have !ipif_addr_ready address; drop packet silently 3536 * until we can get applications to not send until we 3537 * are ready. 3538 */ 3539 error = 0; 3540 goto failed; 3541 case EHOSTUNREACH: 3542 case ENETUNREACH: 3543 if (ixa->ixa_ire != NULL) { 3544 /* 3545 * Let conn_ip_output/ire_send_noroute return 3546 * the error and send any local ICMP error.
3547 */ 3548 error = 0; 3549 break; 3550 } 3551 /* FALLTHRU */ 3552 default: 3553 failed: /* also reached by goto from the srcid lookup above and from the error cases of this switch */ 3554 freemsg(mp); 3555 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3556 goto done; 3557 } 3558 3559 /* 3560 * We might be going to a different destination than last time, 3561 * thus check that TX allows the communication and compute any 3562 * needed label. 3563 * 3564 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3565 * don't have to worry about concurrent threads. 3566 */ 3567 if (is_system_labeled()) { 3568 /* 3569 * Check whether Trusted Solaris policy allows communication 3570 * with this host, and pretend that the destination is 3571 * unreachable if not. 3572 * Compute any needed label and place it in ipp_label_v4/v6. 3573 * 3574 * Later conn_build_hdr_template/conn_prepend_hdr takes 3575 * ipp_label_v4/v6 to form the packet. 3576 * 3577 * Tsol note: We have ipp structure local to this thread so 3578 * no locking is needed. 3579 */ 3580 error = conn_update_label(connp, ixa, &v6dst, ipp); 3581 if (error != 0) { 3582 freemsg(mp); 3583 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3584 goto done; 3585 } 3586 } 3587 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3588 &error); 3589 if (mp == NULL) { 3590 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3591 ASSERT(error != 0); 3592 goto done; 3593 } 3594 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3595 error = EMSGSIZE; 3596 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3597 freemsg(mp); 3598 goto done; 3599 } 3600 3601 /* Policy might differ for different ICMP type/code */ 3602 mp = icmp_output_attach_policy(mp, connp, ixa); 3603 if (mp == NULL) { 3604 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3605 error = EHOSTUNREACH; /* IPsec policy failure */ 3606 goto done; 3607 } 3608 3609 /* We're done. Pass the packet to ip.
*/ 3610 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3611 3612 error = conn_ip_output(mp, ixa); 3613 if (!connp->conn_unspec_src) 3614 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3615 /* No rawipOutErrors if an error since IP increases its error counter */ 3616 switch (error) { 3617 case 0: 3618 break; 3619 case EWOULDBLOCK: 3620 (void) ixa_check_drain_insert(connp, ixa); 3621 error = 0; 3622 break; 3623 case EADDRNOTAVAIL: 3624 /* 3625 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3626 * Don't have the application see that errno 3627 */ 3628 error = ENETUNREACH; 3629 /* FALLTHRU */ 3630 default: 3631 mutex_enter(&connp->conn_lock); 3632 /* 3633 * Clear the source and v6lastdst so we call ip_attr_connect 3634 * for the next packet and try to pick a better source. 3635 */ 3636 if (connp->conn_mcbc_bind) 3637 connp->conn_saddr_v6 = ipv6_all_zeros; 3638 else 3639 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3640 connp->conn_v6lastdst = ipv6_all_zeros; 3641 mutex_exit(&connp->conn_lock); 3642 break; 3643 } 3644 done: 3645 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3646 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3647 ixa->ixa_cpid = connp->conn_cpid; 3648 ixa_refrele(ixa); 3649 ip_pkt_free(ipp); 3650 kmem_free(ipp, sizeof (*ipp)); 3651 return (error); 3652 } 3653 3654 /* 3655 * Handle sending an M_DATA for a connected socket. 3656 * Handles both IPv4 and IPv6. Returns 0 or an errno value; a packet dropped because ip_attr_connect() returned ENETDOWN is reported as 0. Once an ixa has been obtained, ixa_cred/ixa_cpid are restored from the conn_t and the ixa reference is released before every return. 3657 */ 3658 int 3659 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3660 { 3661 icmp_t *icmp = connp->conn_icmp; 3662 icmp_stack_t *is = icmp->icmp_is; 3663 int error; 3664 ip_xmit_attr_t *ixa; 3665 boolean_t do_ipsec; 3666 3667 /* 3668 * If no other thread is using conn_ixa this just gets a reference to 3669 * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3670 */ 3671 ixa = conn_get_ixa(connp, B_FALSE); 3672 if (ixa == NULL) { 3673 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3674 freemsg(mp); 3675 return (ENOMEM); 3676 } 3677 3678 ASSERT(cr != NULL); 3679 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3680 ixa->ixa_cred = cr; 3681 ixa->ixa_cpid = pid; 3682 3683 /* Defer IPsec if it might need to look at ICMP type/code */ 3684 switch (ixa->ixa_protocol) { 3685 case IPPROTO_ICMP: 3686 case IPPROTO_ICMPV6: 3687 do_ipsec = B_FALSE; 3688 break; 3689 default: 3690 do_ipsec = B_TRUE; 3691 } 3692 3693 mutex_enter(&connp->conn_lock); 3694 mp = icmp_prepend_header_template(connp, ixa, mp, 3695 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3696 3697 if (mp == NULL) { 3698 ASSERT(error != 0); 3699 mutex_exit(&connp->conn_lock); 3700 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3701 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3702 ixa->ixa_cpid = connp->conn_cpid; 3703 ixa_refrele(ixa); 3704 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3705 freemsg(mp); /* mp is NULL on this path; icmp_prepend_header_template() already freed the message */ 3706 return (error); 3707 } 3708 3709 if (!do_ipsec) { 3710 /* Policy might differ for different ICMP type/code */ 3711 mp = icmp_output_attach_policy(mp, connp, ixa); 3712 if (mp == NULL) { 3713 mutex_exit(&connp->conn_lock); 3714 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3715 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3716 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3717 ixa->ixa_cpid = connp->conn_cpid; 3718 ixa_refrele(ixa); 3719 return (EHOSTUNREACH); /* IPsec policy failure */ 3720 } 3721 } 3722 3723 /* 3724 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3725 * safe copy, then we need to fill in any pointers in it.
3726 */ 3727 if (ixa->ixa_ire == NULL) { 3728 in6_addr_t faddr, saddr; 3729 in6_addr_t nexthop; 3730 in_port_t fport; 3731 /* Snapshot the conn_t addresses while conn_lock is still held */ 3732 saddr = connp->conn_saddr_v6; 3733 faddr = connp->conn_faddr_v6; 3734 fport = connp->conn_fport; 3735 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3736 mutex_exit(&connp->conn_lock); 3737 3738 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3739 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3740 (do_ipsec ? IPDF_IPSEC : 0)); 3741 switch (error) { 3742 case 0: 3743 break; 3744 case EADDRNOTAVAIL: 3745 /* 3746 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3747 * Don't have the application see that errno 3748 */ 3749 error = ENETUNREACH; 3750 goto failed; 3751 case ENETDOWN: 3752 /* 3753 * Have !ipif_addr_ready address; drop packet silently 3754 * until we can get applications to not send until we 3755 * are ready. 3756 */ 3757 error = 0; 3758 goto failed; 3759 case EHOSTUNREACH: 3760 case ENETUNREACH: 3761 if (ixa->ixa_ire != NULL) { 3762 /* 3763 * Let conn_ip_output/ire_send_noroute return 3764 * the error and send any local ICMP error. 3765 */ 3766 error = 0; 3767 break; 3768 } 3769 /* FALLTHRU */ 3770 default: 3771 failed: 3772 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3773 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3774 ixa->ixa_cpid = connp->conn_cpid; 3775 ixa_refrele(ixa); 3776 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3777 freemsg(mp); 3778 return (error); 3779 } 3780 } else { 3781 /* Done with conn_t */ 3782 mutex_exit(&connp->conn_lock); 3783 } 3784 3785 /* We're done. Pass the packet to ip.
*/ 3786 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3787 3788 error = conn_ip_output(mp, ixa); 3789 /* No rawipOutErrors if an error since IP increases its error counter */ 3790 switch (error) { 3791 case 0: 3792 break; 3793 case EWOULDBLOCK: 3794 (void) ixa_check_drain_insert(connp, ixa); 3795 error = 0; 3796 break; 3797 case EADDRNOTAVAIL: 3798 /* 3799 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3800 * Don't have the application see that errno 3801 */ 3802 error = ENETUNREACH; 3803 break; 3804 } 3805 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3806 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3807 ixa->ixa_cpid = connp->conn_cpid; 3808 ixa_refrele(ixa); 3809 return (error); 3810 } 3811 3812 /* 3813 * Handle sending an M_DATA to the last destination. 3814 * Handles both IPv4 and IPv6. 3815 * 3816 * NOTE: The caller must hold conn_lock and we drop it here. The caller-supplied ixa is released with ixa_refrele() on every return path. Returns 0 or an errno value. 3817 */ 3818 int 3819 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3820 ip_xmit_attr_t *ixa) 3821 { 3822 icmp_t *icmp = connp->conn_icmp; 3823 icmp_stack_t *is = icmp->icmp_is; 3824 int error; 3825 boolean_t do_ipsec; 3826 3827 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3828 ASSERT(ixa != NULL); 3829 3830 ASSERT(cr != NULL); 3831 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3832 ixa->ixa_cred = cr; 3833 ixa->ixa_cpid = pid; 3834 3835 /* Defer IPsec if it might need to look at ICMP type/code */ 3836 switch (ixa->ixa_protocol) { 3837 case IPPROTO_ICMP: 3838 case IPPROTO_ICMPV6: 3839 do_ipsec = B_FALSE; 3840 break; 3841 default: 3842 do_ipsec = B_TRUE; 3843 } 3844 3845 3846 mp = icmp_prepend_header_template(connp, ixa, mp, 3847 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3848 3849 if (mp == NULL) { 3850 ASSERT(error != 0); 3851 mutex_exit(&connp->conn_lock); 3852 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3853 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3854 ixa->ixa_cpid = connp->conn_cpid; 3855 ixa_refrele(ixa); 3856 BUMP_MIB(&is->is_rawip_mib,
rawipOutErrors); 3857 freemsg(mp); /* mp is NULL on this path; icmp_prepend_header_template() already freed the message */ 3858 return (error); 3859 } 3860 3861 if (!do_ipsec) { 3862 /* Policy might differ for different ICMP type/code */ 3863 mp = icmp_output_attach_policy(mp, connp, ixa); 3864 if (mp == NULL) { 3865 mutex_exit(&connp->conn_lock); 3866 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3867 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3868 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3869 ixa->ixa_cpid = connp->conn_cpid; 3870 ixa_refrele(ixa); 3871 return (EHOSTUNREACH); /* IPsec policy failure */ 3872 } 3873 } 3874 3875 /* 3876 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3877 * safe copy, then we need to fill in any pointers in it. 3878 */ 3879 if (ixa->ixa_ire == NULL) { 3880 in6_addr_t lastdst, lastsrc; 3881 in6_addr_t nexthop; 3882 in_port_t lastport; 3883 /* Snapshot the last-destination state while conn_lock is still held */ 3884 lastsrc = connp->conn_v6lastsrc; 3885 lastdst = connp->conn_v6lastdst; 3886 lastport = connp->conn_lastdstport; 3887 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3888 mutex_exit(&connp->conn_lock); 3889 3890 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3891 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3892 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3893 switch (error) { 3894 case 0: 3895 break; 3896 case EADDRNOTAVAIL: 3897 /* 3898 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3899 * Don't have the application see that errno 3900 */ 3901 error = ENETUNREACH; 3902 goto failed; 3903 case ENETDOWN: 3904 /* 3905 * Have !ipif_addr_ready address; drop packet silently 3906 * until we can get applications to not send until we 3907 * are ready. 3908 */ 3909 error = 0; 3910 goto failed; 3911 case EHOSTUNREACH: 3912 case ENETUNREACH: 3913 if (ixa->ixa_ire != NULL) { 3914 /* 3915 * Let conn_ip_output/ire_send_noroute return 3916 * the error and send any local ICMP error.
3917 */ 3918 error = 0; 3919 break; 3920 } 3921 /* FALLTHRU */ 3922 default: 3923 failed: 3924 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3925 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3926 ixa->ixa_cpid = connp->conn_cpid; 3927 ixa_refrele(ixa); 3928 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3929 freemsg(mp); 3930 return (error); 3931 } 3932 } else { 3933 /* Done with conn_t */ 3934 mutex_exit(&connp->conn_lock); 3935 } 3936 3937 /* We're done. Pass the packet to ip. */ 3938 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3939 error = conn_ip_output(mp, ixa); 3940 /* No rawipOutErrors if an error since IP increases its error counter */ 3941 switch (error) { 3942 case 0: 3943 break; 3944 case EWOULDBLOCK: 3945 (void) ixa_check_drain_insert(connp, ixa); 3946 error = 0; 3947 break; 3948 case EADDRNOTAVAIL: 3949 /* 3950 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3951 * Don't have the application see that errno 3952 */ 3953 error = ENETUNREACH; 3954 /* FALLTHRU */ 3955 default: 3956 mutex_enter(&connp->conn_lock); 3957 /* 3958 * Clear the source and v6lastdst so we call ip_attr_connect 3959 * for the next packet and try to pick a better source. 3960 */ 3961 if (connp->conn_mcbc_bind) 3962 connp->conn_saddr_v6 = ipv6_all_zeros; 3963 else 3964 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3965 connp->conn_v6lastdst = ipv6_all_zeros; 3966 mutex_exit(&connp->conn_lock); 3967 break; 3968 } 3969 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3970 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3971 ixa->ixa_cpid = connp->conn_cpid; 3972 ixa_refrele(ixa); 3973 return (error); 3974 } 3975 3976 3977 /* 3978 * Prepend the header template and then fill in the source and 3979 * flowinfo. The caller needs to handle the destination address since 3980 * its setting is different if rthdr or source route. 3981 * 3982 * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET. 3983 * When it returns NULL it sets errorp and has already freed mp. The caller must hold conn_lock.
3984 */ 3985 static mblk_t * 3986 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, 3987 const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) 3988 { 3989 icmp_t *icmp = connp->conn_icmp; 3990 icmp_stack_t *is = icmp->icmp_is; 3991 uint_t pktlen; 3992 uint_t copylen; 3993 uint8_t *iph; 3994 uint_t ip_hdr_length; 3995 uint32_t cksum; 3996 ip_pkt_t *ipp; 3997 3998 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3999 4000 /* 4001 * Copy the header template. 4002 */ 4003 copylen = connp->conn_ht_iphc_len; 4004 pktlen = copylen + msgdsize(mp); 4005 if (pktlen > IP_MAXPACKET) { 4006 freemsg(mp); 4007 *errorp = EMSGSIZE; 4008 return (NULL); 4009 } 4010 ixa->ixa_pktlen = pktlen; 4011 4012 /* check/fix buffer config, setup pointers into it: the header is written in place in front of b_rptr when the data block has a single reference, has room before b_rptr and the result is 32-bit aligned; otherwise a new mblk is allocated and the data is chained behind it */ 4013 iph = mp->b_rptr - copylen; 4014 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { 4015 mblk_t *mp1; 4016 4017 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); 4018 if (mp1 == NULL) { 4019 freemsg(mp); 4020 *errorp = ENOMEM; 4021 return (NULL); 4022 } 4023 mp1->b_wptr = DB_LIM(mp1); /* the header goes at the very end of the new buffer */ 4024 mp1->b_cont = mp; 4025 mp = mp1; 4026 iph = (mp->b_wptr - copylen); 4027 } 4028 mp->b_rptr = iph; 4029 bcopy(connp->conn_ht_iphc, iph, copylen); 4030 ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); 4031 4032 ixa->ixa_ip_hdr_length = ip_hdr_length; 4033 4034 /* 4035 * Prepare for ICMPv6 checksum done in IP. 4036 * 4037 * icmp_build_hdr_template has already massaged any routing header 4038 * and placed the result in conn_sum. 4039 * 4040 * We make it easy for IP to include our pseudo header 4041 * by putting our length (and any routing header adjustment) 4042 * in the ICMPv6 checksum field.
4043 */ 4044 cksum = pktlen - ip_hdr_length; 4045 4046 cksum += connp->conn_sum; 4047 cksum = (cksum >> 16) + (cksum & 0xFFFF); /* fold the carry back into the low 16 bits */ 4048 ASSERT(cksum < 0x10000); 4049 4050 ipp = &connp->conn_xmit_ipp; 4051 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4052 ipha_t *ipha = (ipha_t *)iph; 4053 4054 ipha->ipha_length = htons((uint16_t)pktlen); 4055 4056 /* if IP_PKTINFO specified an address it wins over bind() */ 4057 if ((ipp->ipp_fields & IPPF_ADDR) && 4058 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4059 ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); 4060 ipha->ipha_src = ipp->ipp_addr_v4; 4061 } else { 4062 IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); 4063 } 4064 } else { 4065 ip6_t *ip6h = (ip6_t *)iph; 4066 uint_t cksum_offset = 0; 4067 4068 ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); 4069 4070 /* if IP_PKTINFO specified an address it wins over bind() */ 4071 if ((ipp->ipp_fields & IPPF_ADDR) && 4072 !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4073 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); 4074 ip6h->ip6_src = ipp->ipp_addr; 4075 } else { 4076 ip6h->ip6_src = *v6src; 4077 } 4078 ip6h->ip6_vcf = 4079 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 4080 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 4081 if (ipp->ipp_fields & IPPF_TCLASS) { 4082 /* Overrides the class part of flowinfo */ 4083 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, 4084 ipp->ipp_tclass); 4085 } 4086 4087 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 4088 if (connp->conn_proto == IPPROTO_ICMPV6) { 4089 cksum_offset = ixa->ixa_ip_hdr_length + 4090 offsetof(icmp6_t, icmp6_cksum); 4091 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 4092 cksum_offset = ixa->ixa_ip_hdr_length + 4093 ixa->ixa_raw_cksum_offset; 4094 } 4095 } 4096 if (cksum_offset != 0) { 4097 uint16_t *ptr; 4098 4099 /* Make sure the checksum fits in the first mblk */ 4100 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 4101 mblk_t *mp1; 4102 4103 mp1 = msgpullup(mp, 4104 cksum_offset + sizeof (short)); 4105 freemsg(mp); 4106 if (mp1
== NULL) { 4107 *errorp = ENOMEM; 4108 return (NULL); 4109 } 4110 mp = mp1; 4111 iph = mp->b_rptr; 4112 ip6h = (ip6_t *)iph; 4113 } 4114 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 4115 *ptr = htons(cksum); 4116 } 4117 } 4118 4119 return (mp); 4120 } 4121 4122 /* 4123 * This routine handles all messages passed downstream. It either 4124 * consumes the message or passes it downstream; it never queues a 4125 * message. Always returns 0; send failures are reported through icmp_ud_err(). 4126 */ 4127 int 4128 icmp_wput(queue_t *q, mblk_t *mp) 4129 { 4130 sin6_t *sin6; 4131 sin_t *sin = NULL; 4132 uint_t srcid; 4133 conn_t *connp = Q_TO_CONN(q); 4134 icmp_t *icmp = connp->conn_icmp; 4135 int error = 0; 4136 struct sockaddr *addr = NULL; 4137 socklen_t addrlen; 4138 icmp_stack_t *is = icmp->icmp_is; 4139 struct T_unitdata_req *tudr; 4140 mblk_t *data_mp; 4141 cred_t *cr; 4142 pid_t pid; 4143 4144 /* 4145 * We directly handle T_UNITDATA_REQ messages coming down as 4146 * M_PROTO/M_PCPROTO here. M_DATA is counted as an output error and 4147 * dropped; everything else is handed to icmp_wput_other(). 4148 */ 4149 switch (DB_TYPE(mp)) { 4150 case M_DATA: 4151 /* sockfs never sends down M_DATA */ 4152 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4153 freemsg(mp); 4154 return (0); 4155 4156 case M_PROTO: 4157 case M_PCPROTO: 4158 tudr = (struct T_unitdata_req *)mp->b_rptr; 4159 if (MBLKL(mp) < sizeof (*tudr) || 4160 ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { 4161 icmp_wput_other(q, mp); 4162 return (0); 4163 } 4164 break; 4165 4166 default: 4167 icmp_wput_other(q, mp); 4168 return (0); 4169 } 4170 4171 /* Handle valid T_UNITDATA_REQ here */ 4172 data_mp = mp->b_cont; 4173 if (data_mp == NULL) { 4174 error = EPROTO; 4175 goto ud_error2; 4176 } 4177 mp->b_cont = NULL; 4178 4179 if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { 4180 error = EADDRNOTAVAIL; 4181 goto ud_error2; 4182 } 4183 4184 /* 4185 * All Solaris components should pass a db_credp 4186 * for this message, hence we ASSERT.
4187 * On production kernels we return an error to be robust against 4188 * random streams modules sitting on top of us. 4189 */ 4190 cr = msg_getcred(mp, &pid); 4191 ASSERT(cr != NULL); 4192 if (cr == NULL) { 4193 error = EINVAL; 4194 goto ud_error2; 4195 } 4196 4197 /* 4198 * If a port has not been bound to the stream, fail. 4199 * This is not a problem when sockfs is directly 4200 * above us, because it will ensure that the socket 4201 * is first bound before allowing data to be sent. 4202 */ 4203 if (icmp->icmp_state == TS_UNBND) { 4204 error = EPROTO; 4205 goto ud_error2; 4206 } 4207 addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; 4208 addrlen = tudr->DEST_length; 4209 4210 switch (connp->conn_family) { 4211 case AF_INET6: 4212 sin6 = (sin6_t *)addr; 4213 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || 4214 (sin6->sin6_family != AF_INET6)) { 4215 error = EADDRNOTAVAIL; 4216 goto ud_error2; 4217 } 4218 4219 /* No support for mapped addresses on raw sockets */ 4220 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4221 error = EADDRNOTAVAIL; 4222 goto ud_error2; 4223 } 4224 srcid = sin6->__sin6_src_id; 4225 4226 /* 4227 * If the local address is a mapped address return 4228 * an error. 4229 * It would be possible to send an IPv6 packet but the 4230 * response would never make it back to the application 4231 * since it is bound to a mapped address. 4232 */ 4233 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 4234 error = EADDRNOTAVAIL; 4235 goto ud_error2; 4236 } 4237 /* An unspecified destination is rewritten to the loopback address */ 4238 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 4239 sin6->sin6_addr = ipv6_loopback; 4240 4241 if (tudr->OPT_length != 0) { 4242 /* 4243 * If we are connected then the destination needs to be 4244 * the same as the connected one.
4245 */ 4246 if (icmp->icmp_state == TS_DATA_XFER && 4247 !conn_same_as_last_v6(connp, sin6)) { 4248 error = EISCONN; 4249 goto ud_error2; 4250 } 4251 error = icmp_output_ancillary(connp, NULL, sin6, 4252 data_mp, mp, NULL, cr, pid); 4253 } else { 4254 ip_xmit_attr_t *ixa; 4255 4256 /* 4257 * We have to allocate an ip_xmit_attr_t before we grab 4258 * conn_lock and we need to hold conn_lock once we've 4259 * checked conn_same_as_last_v6 to handle concurrent 4260 * send* calls on a socket. 4261 */ 4262 ixa = conn_get_ixa(connp, B_FALSE); 4263 if (ixa == NULL) { 4264 error = ENOMEM; 4265 goto ud_error2; 4266 } 4267 mutex_enter(&connp->conn_lock); 4268 4269 if (conn_same_as_last_v6(connp, sin6) && 4270 connp->conn_lastsrcid == srcid && 4271 ipsec_outbound_policy_current(ixa)) { 4272 /* icmp_output_lastdst drops conn_lock */ 4273 error = icmp_output_lastdst(connp, data_mp, cr, 4274 pid, ixa); 4275 } else { 4276 /* icmp_output_newdst drops conn_lock */ 4277 error = icmp_output_newdst(connp, data_mp, NULL, 4278 sin6, cr, pid, ixa); 4279 } 4280 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4281 } 4282 if (error == 0) { /* b_cont was detached above, so only the T_UNITDATA_REQ block is freed here */ 4283 freeb(mp); 4284 return (0); 4285 } 4286 break; /* the failure is reported via icmp_ud_err() after the switch */ 4287 4288 case AF_INET: 4289 sin = (sin_t *)addr; 4290 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || 4291 (sin->sin_family != AF_INET)) { 4292 error = EADDRNOTAVAIL; 4293 goto ud_error2; 4294 } 4295 if (sin->sin_addr.s_addr == INADDR_ANY) 4296 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 4297 4298 /* Protocol 255 contains full IP headers */ 4299 /* Read without holding lock */ 4300 if (icmp->icmp_hdrincl) { 4301 if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { 4302 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { 4303 error = EINVAL; 4304 goto ud_error2; 4305 } 4306 } 4307 error = icmp_output_hdrincl(connp, data_mp, cr, pid); 4308 if (error == 0) { 4309 freeb(mp); 4310 return (0); 4311 } 4312 /* data_mp consumed above */ 4313 data_mp = NULL; 4314 goto ud_error2; 4315 } 4316 4317 if
(tudr->OPT_length != 0) { 4318 /* 4319 * If we are connected then the destination needs to be 4320 * the same as the connected one. 4321 */ 4322 if (icmp->icmp_state == TS_DATA_XFER && 4323 !conn_same_as_last_v4(connp, sin)) { 4324 error = EISCONN; 4325 goto ud_error2; 4326 } 4327 error = icmp_output_ancillary(connp, sin, NULL, 4328 data_mp, mp, NULL, cr, pid); 4329 } else { 4330 ip_xmit_attr_t *ixa; 4331 4332 /* 4333 * We have to allocate an ip_xmit_attr_t before we grab 4334 * conn_lock and we need to hold conn_lock once we've 4335 * checked conn_same_as_last_v4 to handle concurrent 4336 * send* calls on a socket. 4337 */ 4338 ixa = conn_get_ixa(connp, B_FALSE); 4339 if (ixa == NULL) { 4340 error = ENOMEM; 4341 goto ud_error2; 4342 } 4343 mutex_enter(&connp->conn_lock); 4344 4345 if (conn_same_as_last_v4(connp, sin) && 4346 ipsec_outbound_policy_current(ixa)) { 4347 /* icmp_output_lastdst drops conn_lock */ 4348 error = icmp_output_lastdst(connp, data_mp, cr, 4349 pid, ixa); 4350 } else { 4351 /* icmp_output_newdst drops conn_lock */ 4352 error = icmp_output_newdst(connp, data_mp, sin, 4353 NULL, cr, pid, ixa); 4354 } 4355 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4356 } 4357 if (error == 0) { 4358 freeb(mp); 4359 return (0); 4360 } 4361 break; 4362 } 4363 ASSERT(mp != NULL); 4364 /* mp is freed by the following routine */ 4365 icmp_ud_err(q, mp, (t_scalar_t)error); 4366 return (0); 4367 4368 ud_error2: /* error exit that also counts rawipOutErrors and frees data_mp, which is NULL when there was no data block or after icmp_output_hdrincl() */ 4369 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4370 freemsg(data_mp); 4371 ASSERT(mp != NULL); 4372 /* mp is freed by the following routine */ 4373 icmp_ud_err(q, mp, (t_scalar_t)error); 4374 return (0); 4375 } 4376 4377 /* 4378 * Handle the case of the IP address or flow label being different 4379 * for both IPv4 and IPv6. 4380 * 4381 * NOTE: The caller must hold conn_lock and we drop it here.
4382 */ 4383 static int 4384 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4385 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4386 { 4387 icmp_t *icmp = connp->conn_icmp; 4388 icmp_stack_t *is = icmp->icmp_is; 4389 int error; 4390 ip_xmit_attr_t *oldixa; 4391 boolean_t do_ipsec; 4392 uint_t srcid; 4393 uint32_t flowinfo; 4394 in6_addr_t v6src; 4395 in6_addr_t v6dst; 4396 in6_addr_t v6nexthop; 4397 in_port_t dstport; 4398 4399 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4400 ASSERT(ixa != NULL); 4401 4402 /* 4403 * We hold conn_lock across all the use and modifications of 4404 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4405 * stay consistent. 4406 */ 4407 4408 ASSERT(cr != NULL); 4409 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4410 ixa->ixa_cred = cr; 4411 ixa->ixa_cpid = pid; 4412 if (is_system_labeled()) { 4413 /* We need to restart with a label based on the cred */ 4414 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4415 } 4416 /* 4417 * If we are connected then the destination needs to be the 4418 * same as the connected one, which is not the case here since we 4419 * checked for that above. 4420 */ 4421 if (icmp->icmp_state == TS_DATA_XFER) { 4422 mutex_exit(&connp->conn_lock); 4423 error = EISCONN; 4424 goto ud_error; 4425 } 4426 4427 /* 4428 * Before we modify the ixa at all, invalidate our most recent address 4429 * to assure that any subsequent call to conn_same_as_last_v6() will 4430 * not indicate a match: any thread that picks up conn_lock after we 4431 * drop it (but before we pick it up again and properly set the most 4432 * recent address) must not associate the ixa with the (now old) last 4433 * address. 4434 */ 4435 connp->conn_v6lastdst = ipv6_all_zeros; 4436 4437 /* In case previous destination was multicast or multirt */ 4438 ip_attr_newdst(ixa); 4439 4440 /* 4441 * If laddr is unspecified then we look at sin6_src_id. 
4442 * We will give precedence to a source address set with IPV6_PKTINFO 4443 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4444 * want ip_attr_connect to select a source (since it can fail) when 4445 * IPV6_PKTINFO is specified. 4446 * If this doesn't result in a source address then we get a source 4447 * from ip_attr_connect() below. 4448 */ 4449 v6src = connp->conn_saddr_v6; 4450 if (sin != NULL) { 4451 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4452 dstport = sin->sin_port; 4453 flowinfo = 0; 4454 /* Don't bother with ip_srcid_find_id(), but indicate anyway. */ 4455 srcid = 0; 4456 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4457 ixa->ixa_flags |= IXAF_IS_IPV4; 4458 } else { 4459 boolean_t v4mapped; 4460 4461 v6dst = sin6->sin6_addr; 4462 dstport = sin6->sin6_port; 4463 flowinfo = sin6->sin6_flowinfo; 4464 srcid = sin6->__sin6_src_id; 4465 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4466 ixa->ixa_scopeid = sin6->sin6_scope_id; 4467 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4468 } else { 4469 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4470 } 4471 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 4472 if (v4mapped) 4473 ixa->ixa_flags |= IXAF_IS_IPV4; 4474 else 4475 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4476 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4477 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4478 v4mapped, connp->conn_netstack)) { 4479 /* Mismatched v4mapped/v6 specified by srcid. */ 4480 mutex_exit(&connp->conn_lock); 4481 error = EADDRNOTAVAIL; 4482 goto ud_error; 4483 } 4484 } 4485 } 4486 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 4487 if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) { 4488 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4489 4490 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4491 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4492 v6src = ipp->ipp_addr; 4493 } else { 4494 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4495 v6src = ipp->ipp_addr; 4496 } 4497 } 4498 4499 /* Defer IPsec if it might need to look at ICMP type/code */ 4500 switch (ixa->ixa_protocol) { 4501 case IPPROTO_ICMP: 4502 case IPPROTO_ICMPV6: 4503 do_ipsec = B_FALSE; 4504 break; 4505 default: 4506 do_ipsec = B_TRUE; 4507 } 4508 4509 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4510 mutex_exit(&connp->conn_lock); 4511 4512 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4513 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4514 (do_ipsec ? IPDF_IPSEC : 0)); 4515 switch (error) { 4516 case 0: 4517 break; 4518 case EADDRNOTAVAIL: 4519 /* 4520 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4521 * Don't have the application see that errno 4522 */ 4523 error = ENETUNREACH; 4524 goto failed; 4525 case ENETDOWN: 4526 /* 4527 * Have !ipif_addr_ready address; drop packet silently 4528 * until we can get applications to not send until we 4529 * are ready. 4530 */ 4531 error = 0; 4532 goto failed; 4533 case EHOSTUNREACH: 4534 case ENETUNREACH: 4535 if (ixa->ixa_ire != NULL) { 4536 /* 4537 * Let conn_ip_output/ire_send_noroute return 4538 * the error and send any local ICMP error. 4539 */ 4540 error = 0; 4541 break; 4542 } 4543 /* FALLTHRU */ 4544 default: 4545 failed: 4546 goto ud_error; 4547 } 4548 4549 mutex_enter(&connp->conn_lock); 4550 /* 4551 * While we dropped the lock some other thread might have connected 4552 * this socket. If so we bail out with EISCONN to ensure that the 4553 * connecting thread is the one that updates conn_ixa, conn_ht_* 4554 * and conn_*last*. 
4555 */ 4556 if (icmp->icmp_state == TS_DATA_XFER) { 4557 mutex_exit(&connp->conn_lock); 4558 error = EISCONN; 4559 goto ud_error; 4560 } 4561 4562 /* 4563 * We need to rebuild the headers if 4564 * - we are labeling packets (could be different for different 4565 * destinations) 4566 * - we have a source route (or routing header) since we need to 4567 * massage that to get the pseudo-header checksum 4568 * - a socket option with COA_HEADER_CHANGED has been set which 4569 * set conn_v6lastdst to zero. 4570 * 4571 * Otherwise the prepend function will just update the src, dst, 4572 * and flow label. 4573 */ 4574 if (is_system_labeled()) { 4575 /* TX MLP requires SCM_UCRED and don't have that here */ 4576 if (connp->conn_mlp_type != mlptSingle) { 4577 mutex_exit(&connp->conn_lock); 4578 error = ECONNREFUSED; 4579 goto ud_error; 4580 } 4581 /* 4582 * Check whether Trusted Solaris policy allows communication 4583 * with this host, and pretend that the destination is 4584 * unreachable if not. 4585 * Compute any needed label and place it in ipp_label_v4/v6. 4586 * 4587 * Later conn_build_hdr_template/conn_prepend_hdr takes 4588 * ipp_label_v4/v6 to form the packet. 4589 * 4590 * Tsol note: Since we hold conn_lock we know no other 4591 * thread manipulates conn_xmit_ipp. 
4592 */ 4593 error = conn_update_label(connp, ixa, &v6dst, 4594 &connp->conn_xmit_ipp); 4595 if (error != 0) { 4596 mutex_exit(&connp->conn_lock); 4597 goto ud_error; 4598 } 4599 /* Rebuild the header template */ 4600 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4601 flowinfo); 4602 if (error != 0) { 4603 mutex_exit(&connp->conn_lock); 4604 goto ud_error; 4605 } 4606 } else if (connp->conn_xmit_ipp.ipp_fields & 4607 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4608 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4609 /* Rebuild the header template */ 4610 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4611 flowinfo); 4612 if (error != 0) { 4613 mutex_exit(&connp->conn_lock); 4614 goto ud_error; 4615 } 4616 } else { 4617 /* Simply update the destination address if no source route */ 4618 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4619 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4620 4621 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4622 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4623 ipha->ipha_fragment_offset_and_flags |= 4624 IPH_DF_HTONS; 4625 } else { 4626 ipha->ipha_fragment_offset_and_flags &= 4627 ~IPH_DF_HTONS; 4628 } 4629 } else { 4630 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4631 ip6h->ip6_dst = v6dst; 4632 } 4633 } 4634 4635 /* 4636 * Remember the dst etc which corresponds to the built header 4637 * template and conn_ixa. 
4638 */ 4639 oldixa = conn_replace_ixa(connp, ixa); 4640 connp->conn_v6lastdst = v6dst; 4641 connp->conn_lastflowinfo = flowinfo; 4642 connp->conn_lastscopeid = ixa->ixa_scopeid; 4643 connp->conn_lastsrcid = srcid; 4644 /* Also remember a source to use together with lastdst */ 4645 connp->conn_v6lastsrc = v6src; 4646 4647 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4648 flowinfo, &error); 4649 4650 /* Done with conn_t */ 4651 mutex_exit(&connp->conn_lock); 4652 ixa_refrele(oldixa); 4653 4654 if (data_mp == NULL) { 4655 ASSERT(error != 0); 4656 goto ud_error; 4657 } 4658 4659 if (!do_ipsec) { 4660 /* Policy might differ for different ICMP type/code */ 4661 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4662 if (data_mp == NULL) { 4663 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4664 error = EHOSTUNREACH; /* IPsec policy failure */ 4665 goto done; 4666 } 4667 } 4668 4669 /* We're done. Pass the packet to ip. */ 4670 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4671 4672 error = conn_ip_output(data_mp, ixa); 4673 /* No rawipOutErrors if an error since IP increases its error counter */ 4674 switch (error) { 4675 case 0: 4676 break; 4677 case EWOULDBLOCK: 4678 (void) ixa_check_drain_insert(connp, ixa); 4679 error = 0; 4680 break; 4681 case EADDRNOTAVAIL: 4682 /* 4683 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4684 * Don't have the application see that errno 4685 */ 4686 error = ENETUNREACH; 4687 /* FALLTHRU */ 4688 default: 4689 mutex_enter(&connp->conn_lock); 4690 /* 4691 * Clear the source and v6lastdst so we call ip_attr_connect 4692 * for the next packet and try to pick a better source. 
4693 */ 4694 if (connp->conn_mcbc_bind) 4695 connp->conn_saddr_v6 = ipv6_all_zeros; 4696 else 4697 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4698 connp->conn_v6lastdst = ipv6_all_zeros; 4699 mutex_exit(&connp->conn_lock); 4700 break; 4701 } 4702 done: 4703 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4704 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4705 ixa->ixa_cpid = connp->conn_cpid; 4706 ixa_refrele(ixa); 4707 return (error); 4708 4709 ud_error: 4710 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4711 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4712 ixa->ixa_cpid = connp->conn_cpid; 4713 ixa_refrele(ixa); 4714 4715 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4716 freemsg(data_mp); 4717 return (error); 4718 } 4719 4720 /* ARGSUSED */ 4721 static int 4722 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4723 { 4724 #ifdef DEBUG 4725 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4726 #endif 4727 freemsg(mp); 4728 return (0); 4729 } 4730 4731 static void 4732 icmp_wput_other(queue_t *q, mblk_t *mp) 4733 { 4734 uchar_t *rptr = mp->b_rptr; 4735 struct iocblk *iocp; 4736 conn_t *connp = Q_TO_CONN(q); 4737 icmp_t *icmp = connp->conn_icmp; 4738 cred_t *cr; 4739 4740 switch (mp->b_datap->db_type) { 4741 case M_PROTO: 4742 case M_PCPROTO: 4743 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4744 /* 4745 * If the message does not contain a PRIM_type, 4746 * throw it away. 4747 */ 4748 freemsg(mp); 4749 return; 4750 } 4751 switch (((t_primp_t)rptr)->type) { 4752 case T_ADDR_REQ: 4753 icmp_addr_req(q, mp); 4754 return; 4755 case O_T_BIND_REQ: 4756 case T_BIND_REQ: 4757 icmp_tpi_bind(q, mp); 4758 return; 4759 case T_CONN_REQ: 4760 icmp_tpi_connect(q, mp); 4761 return; 4762 case T_CAPABILITY_REQ: 4763 icmp_capability_req(q, mp); 4764 return; 4765 case T_INFO_REQ: 4766 icmp_info_req(q, mp); 4767 return; 4768 case T_UNITDATA_REQ: 4769 /* 4770 * If a T_UNITDATA_REQ gets here, the address must 4771 * be bad. 
Valid T_UNITDATA_REQs are handled 4772 * in icmp_wput. 4773 */ 4774 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4775 return; 4776 case T_UNBIND_REQ: 4777 icmp_tpi_unbind(q, mp); 4778 return; 4779 case T_SVR4_OPTMGMT_REQ: 4780 /* 4781 * All Solaris components should pass a db_credp 4782 * for this TPI message, hence we ASSERT. 4783 * But in case there is some other M_PROTO that looks 4784 * like a TPI message sent by some other kernel 4785 * component, we check and return an error. 4786 */ 4787 cr = msg_getcred(mp, NULL); 4788 ASSERT(cr != NULL); 4789 if (cr == NULL) { 4790 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4791 return; 4792 } 4793 4794 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4795 cr)) { 4796 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4797 } 4798 return; 4799 4800 case T_OPTMGMT_REQ: 4801 /* 4802 * All Solaris components should pass a db_credp 4803 * for this TPI message, hence we ASSERT. 4804 * But in case there is some other M_PROTO that looks 4805 * like a TPI message sent by some other kernel 4806 * component, we check and return an error. 4807 */ 4808 cr = msg_getcred(mp, NULL); 4809 ASSERT(cr != NULL); 4810 if (cr == NULL) { 4811 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4812 return; 4813 } 4814 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4815 return; 4816 4817 case T_DISCON_REQ: 4818 icmp_tpi_disconnect(q, mp); 4819 return; 4820 4821 /* The following TPI message is not supported by icmp. */ 4822 case O_T_CONN_RES: 4823 case T_CONN_RES: 4824 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4825 return; 4826 4827 /* The following 3 TPI requests are illegal for icmp. 
*/ 4828 case T_DATA_REQ: 4829 case T_EXDATA_REQ: 4830 case T_ORDREL_REQ: 4831 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4832 return; 4833 default: 4834 break; 4835 } 4836 break; 4837 case M_FLUSH: 4838 if (*rptr & FLUSHW) 4839 flushq(q, FLUSHDATA); 4840 break; 4841 case M_IOCTL: 4842 iocp = (struct iocblk *)mp->b_rptr; 4843 switch (iocp->ioc_cmd) { 4844 case TI_GETPEERNAME: 4845 if (icmp->icmp_state != TS_DATA_XFER) { 4846 /* 4847 * If a default destination address has not 4848 * been associated with the stream, then we 4849 * don't know the peer's name. 4850 */ 4851 iocp->ioc_error = ENOTCONN; 4852 iocp->ioc_count = 0; 4853 mp->b_datap->db_type = M_IOCACK; 4854 qreply(q, mp); 4855 return; 4856 } 4857 /* FALLTHRU */ 4858 case TI_GETMYNAME: 4859 /* 4860 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4861 * need to copyin the user's strbuf structure. 4862 * Processing will continue in the M_IOCDATA case 4863 * below. 4864 */ 4865 mi_copyin(q, mp, NULL, 4866 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4867 return; 4868 default: 4869 break; 4870 } 4871 break; 4872 case M_IOCDATA: 4873 icmp_wput_iocdata(q, mp); 4874 return; 4875 default: 4876 /* Unrecognized messages are passed through without change. */ 4877 break; 4878 } 4879 ip_wput_nondata(q, mp); 4880 } 4881 4882 /* 4883 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4884 * messages. 4885 */ 4886 static void 4887 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4888 { 4889 mblk_t *mp1; 4890 STRUCT_HANDLE(strbuf, sb); 4891 uint_t addrlen; 4892 conn_t *connp = Q_TO_CONN(q); 4893 icmp_t *icmp = connp->conn_icmp; 4894 4895 /* Make sure it is one of ours. 
*/ 4896 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4897 case TI_GETMYNAME: 4898 case TI_GETPEERNAME: 4899 break; 4900 default: 4901 ip_wput_nondata(q, mp); 4902 return; 4903 } 4904 4905 switch (mi_copy_state(q, mp, &mp1)) { 4906 case -1: 4907 return; 4908 case MI_COPY_CASE(MI_COPY_IN, 1): 4909 break; 4910 case MI_COPY_CASE(MI_COPY_OUT, 1): 4911 /* 4912 * The address has been copied out, so now 4913 * copyout the strbuf. 4914 */ 4915 mi_copyout(q, mp); 4916 return; 4917 case MI_COPY_CASE(MI_COPY_OUT, 2): 4918 /* 4919 * The address and strbuf have been copied out. 4920 * We're done, so just acknowledge the original 4921 * M_IOCTL. 4922 */ 4923 mi_copy_done(q, mp, 0); 4924 return; 4925 default: 4926 /* 4927 * Something strange has happened, so acknowledge 4928 * the original M_IOCTL with an EPROTO error. 4929 */ 4930 mi_copy_done(q, mp, EPROTO); 4931 return; 4932 } 4933 4934 /* 4935 * Now we have the strbuf structure for TI_GETMYNAME 4936 * and TI_GETPEERNAME. Next we copyout the requested 4937 * address and then we'll copyout the strbuf. 
4938 */ 4939 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4940 (void *)mp1->b_rptr); 4941 4942 if (connp->conn_family == AF_INET) 4943 addrlen = sizeof (sin_t); 4944 else 4945 addrlen = sizeof (sin6_t); 4946 4947 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4948 mi_copy_done(q, mp, EINVAL); 4949 return; 4950 } 4951 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4952 case TI_GETMYNAME: 4953 break; 4954 case TI_GETPEERNAME: 4955 if (icmp->icmp_state != TS_DATA_XFER) { 4956 mi_copy_done(q, mp, ENOTCONN); 4957 return; 4958 } 4959 break; 4960 default: 4961 mi_copy_done(q, mp, EPROTO); 4962 return; 4963 } 4964 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4965 if (!mp1) 4966 return; 4967 4968 STRUCT_FSET(sb, len, addrlen); 4969 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4970 case TI_GETMYNAME: 4971 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4972 &addrlen); 4973 break; 4974 case TI_GETPEERNAME: 4975 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4976 &addrlen); 4977 break; 4978 } 4979 mp1->b_wptr += addrlen; 4980 /* Copy out the address */ 4981 mi_copyout(q, mp); 4982 } 4983 4984 void 4985 icmp_ddi_g_init(void) 4986 { 4987 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4988 icmp_opt_obj.odb_opt_arr_cnt); 4989 4990 /* 4991 * We want to be informed each time a stack is created or 4992 * destroyed in the kernel, so we can maintain the 4993 * set of icmp_stack_t's. 4994 */ 4995 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4996 } 4997 4998 void 4999 icmp_ddi_g_destroy(void) 5000 { 5001 netstack_unregister(NS_ICMP); 5002 } 5003 5004 #define INET_NAME "ip" 5005 5006 /* 5007 * Initialize the ICMP stack instance. 
5008 */ 5009 static void * 5010 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 5011 { 5012 icmp_stack_t *is; 5013 int error = 0; 5014 size_t arrsz; 5015 major_t major; 5016 5017 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 5018 is->is_netstack = ns; 5019 5020 arrsz = sizeof (icmp_propinfo_tbl); 5021 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 5022 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 5023 5024 is->is_ksp = rawip_kstat_init(stackid); 5025 5026 major = mod_name_to_major(INET_NAME); 5027 error = ldi_ident_from_major(major, &is->is_ldi_ident); 5028 ASSERT(error == 0); 5029 return (is); 5030 } 5031 5032 /* 5033 * Free the ICMP stack instance. 5034 */ 5035 static void 5036 rawip_stack_fini(netstackid_t stackid, void *arg) 5037 { 5038 icmp_stack_t *is = (icmp_stack_t *)arg; 5039 5040 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 5041 is->is_propinfo_tbl = NULL; 5042 5043 rawip_kstat_fini(stackid, is->is_ksp); 5044 is->is_ksp = NULL; 5045 ldi_ident_release(is->is_ldi_ident); 5046 kmem_free(is, sizeof (*is)); 5047 } 5048 5049 static void * 5050 rawip_kstat_init(netstackid_t stackid) 5051 { 5052 kstat_t *ksp; 5053 5054 rawip_named_kstat_t template = { 5055 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5056 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5057 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5058 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5059 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5060 }; 5061 5062 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5063 KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid); 5064 if (ksp == NULL || ksp->ks_data == NULL) 5065 return (NULL); 5066 5067 bcopy(&template, ksp->ks_data, sizeof (template)); 5068 ksp->ks_update = rawip_kstat_update; 5069 ksp->ks_private = (void *)(uintptr_t)stackid; 5070 5071 kstat_install(ksp); 5072 return (ksp); 5073 } 5074 5075 static void 5076 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5077 { 5078 if (ksp != NULL) { 
5079 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5080 kstat_delete_netstack(ksp, stackid); 5081 } 5082 } 5083 5084 static int 5085 rawip_kstat_update(kstat_t *ksp, int rw) 5086 { 5087 rawip_named_kstat_t *rawipkp; 5088 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5089 netstack_t *ns; 5090 icmp_stack_t *is; 5091 5092 if (ksp->ks_data == NULL) 5093 return (EIO); 5094 5095 if (rw == KSTAT_WRITE) 5096 return (EACCES); 5097 5098 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5099 5100 ns = netstack_find_by_stackid(stackid); 5101 if (ns == NULL) 5102 return (-1); 5103 is = ns->netstack_icmp; 5104 if (is == NULL) { 5105 netstack_rele(ns); 5106 return (-1); 5107 } 5108 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5109 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5110 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5111 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5112 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5113 netstack_rele(ns); 5114 return (0); 5115 } 5116 5117 /* ARGSUSED */ 5118 int 5119 rawip_accept(sock_lower_handle_t lproto_handle, 5120 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5121 cred_t *cr) 5122 { 5123 return (EOPNOTSUPP); 5124 } 5125 5126 /* ARGSUSED */ 5127 int 5128 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5129 socklen_t len, cred_t *cr) 5130 { 5131 conn_t *connp = (conn_t *)proto_handle; 5132 int error; 5133 5134 /* All Solaris components should pass a cred for this operation. 
*/ 5135 ASSERT(cr != NULL); 5136 5137 /* Binding to a NULL address really means unbind */ 5138 if (sa == NULL) 5139 error = rawip_do_unbind(connp); 5140 else 5141 error = rawip_do_bind(connp, sa, len); 5142 5143 if (error < 0) { 5144 if (error == -TOUTSTATE) 5145 error = EINVAL; 5146 else 5147 error = proto_tlitosyserr(-error); 5148 } 5149 return (error); 5150 } 5151 5152 static int 5153 rawip_implicit_bind(conn_t *connp) 5154 { 5155 sin6_t sin6addr; 5156 sin_t *sin; 5157 sin6_t *sin6; 5158 socklen_t len; 5159 int error; 5160 5161 if (connp->conn_family == AF_INET) { 5162 len = sizeof (struct sockaddr_in); 5163 sin = (sin_t *)&sin6addr; 5164 *sin = sin_null; 5165 sin->sin_family = AF_INET; 5166 sin->sin_addr.s_addr = INADDR_ANY; 5167 } else { 5168 ASSERT(connp->conn_family == AF_INET6); 5169 len = sizeof (sin6_t); 5170 sin6 = (sin6_t *)&sin6addr; 5171 *sin6 = sin6_null; 5172 sin6->sin6_family = AF_INET6; 5173 V6_SET_ZERO(sin6->sin6_addr); 5174 } 5175 5176 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5177 5178 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5179 } 5180 5181 static int 5182 rawip_unbind(conn_t *connp) 5183 { 5184 int error; 5185 5186 error = rawip_do_unbind(connp); 5187 if (error < 0) { 5188 error = proto_tlitosyserr(-error); 5189 } 5190 return (error); 5191 } 5192 5193 /* ARGSUSED */ 5194 int 5195 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5196 { 5197 return (EOPNOTSUPP); 5198 } 5199 5200 int 5201 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5202 socklen_t len, sock_connid_t *id, cred_t *cr) 5203 { 5204 conn_t *connp = (conn_t *)proto_handle; 5205 icmp_t *icmp = connp->conn_icmp; 5206 int error; 5207 boolean_t did_bind = B_FALSE; 5208 pid_t pid = curproc->p_pid; 5209 5210 /* All Solaris components should pass a cred for this operation. 
*/ 5211 ASSERT(cr != NULL); 5212 5213 if (sa == NULL) { 5214 /* 5215 * Disconnect 5216 * Make sure we are connected 5217 */ 5218 if (icmp->icmp_state != TS_DATA_XFER) 5219 return (EINVAL); 5220 5221 error = icmp_disconnect(connp); 5222 return (error); 5223 } 5224 5225 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5226 if (error != 0) 5227 return (error); 5228 5229 /* do an implicit bind if necessary */ 5230 if (icmp->icmp_state == TS_UNBND) { 5231 error = rawip_implicit_bind(connp); 5232 /* 5233 * We could be racing with an actual bind, in which case 5234 * we would see EPROTO. We cross our fingers and try 5235 * to connect. 5236 */ 5237 if (!(error == 0 || error == EPROTO)) 5238 return (error); 5239 did_bind = B_TRUE; 5240 } 5241 5242 /* 5243 * set SO_DGRAM_ERRIND 5244 */ 5245 connp->conn_dgram_errind = B_TRUE; 5246 5247 error = rawip_do_connect(connp, sa, len, cr, pid); 5248 if (error != 0 && did_bind) { 5249 int unbind_err; 5250 5251 unbind_err = rawip_unbind(connp); 5252 ASSERT(unbind_err == 0); 5253 } 5254 5255 if (error == 0) { 5256 *id = 0; 5257 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5258 0, NULL, -1); 5259 } else if (error < 0) { 5260 error = proto_tlitosyserr(-error); 5261 } 5262 return (error); 5263 } 5264 5265 /* ARGSUSED2 */ 5266 int 5267 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5268 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 5269 sock_quiesce_arg_t *arg) 5270 { 5271 conn_t *connp = (conn_t *)proto_handle; 5272 icmp_t *icmp; 5273 struct T_capability_ack tca; 5274 struct sockaddr_in6 laddr, faddr; 5275 socklen_t laddrlen, faddrlen; 5276 short opts; 5277 struct stroptions *stropt; 5278 mblk_t *mp, *stropt_mp; 5279 int error; 5280 5281 icmp = connp->conn_icmp; 5282 5283 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5284 5285 /* 5286 * setup the fallback stream that was allocated 5287 */ 5288 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5289 connp->conn_minor_arena = 
WR(q)->q_ptr; 5290 5291 RD(q)->q_ptr = WR(q)->q_ptr = connp; 5292 5293 WR(q)->q_qinfo = &icmpwinit; 5294 5295 connp->conn_rq = RD(q); 5296 connp->conn_wq = WR(q); 5297 5298 /* Notify stream head about options before sending up data */ 5299 stropt_mp->b_datap->db_type = M_SETOPTS; 5300 stropt_mp->b_wptr += sizeof (*stropt); 5301 stropt = (struct stroptions *)stropt_mp->b_rptr; 5302 stropt->so_flags = SO_WROFF | SO_HIWAT; 5303 stropt->so_wroff = connp->conn_wroff; 5304 stropt->so_hiwat = connp->conn_rcvbuf; 5305 putnext(RD(q), stropt_mp); 5306 5307 /* 5308 * free helper stream 5309 */ 5310 ip_free_helper_stream(connp); 5311 5312 /* 5313 * Collect the information needed to sync with the sonode 5314 */ 5315 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5316 5317 laddrlen = faddrlen = sizeof (sin6_t); 5318 (void) rawip_getsockname((sock_lower_handle_t)connp, 5319 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5320 error = rawip_getpeername((sock_lower_handle_t)connp, 5321 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5322 if (error != 0) 5323 faddrlen = 0; 5324 opts = 0; 5325 if (connp->conn_dgram_errind) 5326 opts |= SO_DGRAM_ERRIND; 5327 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5328 opts |= SO_DONTROUTE; 5329 5330 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 5331 (struct sockaddr *)&laddr, laddrlen, 5332 (struct sockaddr *)&faddr, faddrlen, opts); 5333 5334 /* 5335 * Attempts to send data up during fallback will result in it being 5336 * queued in icmp_t. Now we push up any queued packets. 
5337 */ 5338 mutex_enter(&icmp->icmp_recv_lock); 5339 if (mp != NULL) { 5340 mp->b_next = icmp->icmp_fallback_queue_head; 5341 icmp->icmp_fallback_queue_head = mp; 5342 } 5343 while (icmp->icmp_fallback_queue_head != NULL) { 5344 mp = icmp->icmp_fallback_queue_head; 5345 icmp->icmp_fallback_queue_head = mp->b_next; 5346 mp->b_next = NULL; 5347 mutex_exit(&icmp->icmp_recv_lock); 5348 putnext(RD(q), mp); 5349 mutex_enter(&icmp->icmp_recv_lock); 5350 } 5351 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5352 5353 /* 5354 * No longer a streams less socket 5355 */ 5356 mutex_enter(&connp->conn_lock); 5357 connp->conn_flags &= ~IPCL_NONSTR; 5358 mutex_exit(&connp->conn_lock); 5359 5360 mutex_exit(&icmp->icmp_recv_lock); 5361 5362 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5363 icmp->icmp_fallback_queue_tail == NULL); 5364 5365 ASSERT(connp->conn_ref >= 1); 5366 5367 return (0); 5368 } 5369 5370 /* ARGSUSED2 */ 5371 sock_lower_handle_t 5372 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5373 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5374 { 5375 conn_t *connp; 5376 5377 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5378 *errorp = EPROTONOSUPPORT; 5379 return (NULL); 5380 } 5381 5382 connp = rawip_do_open(family, credp, errorp, flags); 5383 if (connp != NULL) { 5384 connp->conn_flags |= IPCL_NONSTR; 5385 5386 mutex_enter(&connp->conn_lock); 5387 connp->conn_state_flags &= ~CONN_INCIPIENT; 5388 mutex_exit(&connp->conn_lock); 5389 *sock_downcalls = &sock_rawip_downcalls; 5390 *smodep = SM_ATOMIC; 5391 } else { 5392 ASSERT(*errorp != 0); 5393 } 5394 5395 return ((sock_lower_handle_t)connp); 5396 } 5397 5398 /* ARGSUSED3 */ 5399 void 5400 rawip_activate(sock_lower_handle_t proto_handle, 5401 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5402 cred_t *cr) 5403 { 5404 conn_t *connp = (conn_t *)proto_handle; 5405 struct sock_proto_props sopp; 5406 5407 /* All 
Solaris components should pass a cred for this operation. */ 5408 ASSERT(cr != NULL); 5409 5410 connp->conn_upcalls = sock_upcalls; 5411 connp->conn_upper_handle = sock_handle; 5412 5413 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5414 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5415 sopp.sopp_wroff = connp->conn_wroff; 5416 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5417 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5418 sopp.sopp_maxblk = INFPSZ; 5419 sopp.sopp_maxpsz = IP_MAXPACKET; 5420 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5421 icmp_mod_info.mi_minpsz; 5422 5423 (*connp->conn_upcalls->su_set_proto_props) 5424 (connp->conn_upper_handle, &sopp); 5425 5426 icmp_bind_proto(connp->conn_icmp); 5427 } 5428 5429 /* ARGSUSED3 */ 5430 int 5431 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5432 socklen_t *salenp, cred_t *cr) 5433 { 5434 conn_t *connp = (conn_t *)proto_handle; 5435 icmp_t *icmp = connp->conn_icmp; 5436 int error; 5437 5438 /* All Solaris components should pass a cred for this operation. */ 5439 ASSERT(cr != NULL); 5440 5441 mutex_enter(&connp->conn_lock); 5442 if (icmp->icmp_state != TS_DATA_XFER) 5443 error = ENOTCONN; 5444 else 5445 error = conn_getpeername(connp, sa, salenp); 5446 mutex_exit(&connp->conn_lock); 5447 return (error); 5448 } 5449 5450 /* ARGSUSED3 */ 5451 int 5452 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5453 socklen_t *salenp, cred_t *cr) 5454 { 5455 conn_t *connp = (conn_t *)proto_handle; 5456 int error; 5457 5458 /* All Solaris components should pass a cred for this operation. 
*/ 5459 ASSERT(cr != NULL); 5460 5461 mutex_enter(&connp->conn_lock); 5462 error = conn_getsockname(connp, sa, salenp); 5463 mutex_exit(&connp->conn_lock); 5464 return (error); 5465 } 5466 5467 int 5468 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5469 const void *optvalp, socklen_t optlen, cred_t *cr) 5470 { 5471 conn_t *connp = (conn_t *)proto_handle; 5472 int error; 5473 5474 /* All Solaris components should pass a cred for this operation. */ 5475 ASSERT(cr != NULL); 5476 5477 error = proto_opt_check(level, option_name, optlen, NULL, 5478 icmp_opt_obj.odb_opt_des_arr, 5479 icmp_opt_obj.odb_opt_arr_cnt, 5480 B_TRUE, B_FALSE, cr); 5481 5482 if (error != 0) { 5483 /* 5484 * option not recognized 5485 */ 5486 if (error < 0) { 5487 error = proto_tlitosyserr(-error); 5488 } 5489 return (error); 5490 } 5491 5492 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5493 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5494 (uchar_t *)optvalp, NULL, cr); 5495 5496 ASSERT(error >= 0); 5497 5498 return (error); 5499 } 5500 5501 int 5502 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5503 void *optvalp, socklen_t *optlen, cred_t *cr) 5504 { 5505 int error; 5506 conn_t *connp = (conn_t *)proto_handle; 5507 t_uscalar_t max_optbuf_len; 5508 void *optvalp_buf; 5509 int len; 5510 5511 /* All Solaris components should pass a cred for this operation. 
*/ 5512 ASSERT(cr != NULL); 5513 5514 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5515 icmp_opt_obj.odb_opt_des_arr, 5516 icmp_opt_obj.odb_opt_arr_cnt, 5517 B_FALSE, B_TRUE, cr); 5518 5519 if (error != 0) { 5520 if (error < 0) { 5521 error = proto_tlitosyserr(-error); 5522 } 5523 return (error); 5524 } 5525 5526 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5527 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5528 if (len == -1) { 5529 kmem_free(optvalp_buf, max_optbuf_len); 5530 return (EINVAL); 5531 } 5532 5533 /* 5534 * update optlen and copy option value 5535 */ 5536 t_uscalar_t size = MIN(len, *optlen); 5537 5538 bcopy(optvalp_buf, optvalp, size); 5539 bcopy(&size, optlen, sizeof (size)); 5540 5541 kmem_free(optvalp_buf, max_optbuf_len); 5542 return (0); 5543 } 5544 5545 /* ARGSUSED1 */ 5546 int 5547 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5548 { 5549 conn_t *connp = (conn_t *)proto_handle; 5550 5551 /* All Solaris components should pass a cred for this operation. */ 5552 ASSERT(cr != NULL); 5553 5554 (void) rawip_do_close(connp); 5555 return (0); 5556 } 5557 5558 /* ARGSUSED2 */ 5559 int 5560 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5561 { 5562 conn_t *connp = (conn_t *)proto_handle; 5563 5564 /* All Solaris components should pass a cred for this operation. 
*/ 5565 ASSERT(cr != NULL); 5566 5567 /* shut down the send side */ 5568 if (how != SHUT_RD) 5569 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5570 SOCK_OPCTL_SHUT_SEND, 0); 5571 /* shut down the recv side */ 5572 if (how != SHUT_WR) 5573 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5574 SOCK_OPCTL_SHUT_RECV, 0); 5575 return (0); 5576 } 5577 5578 void 5579 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5580 { 5581 conn_t *connp = (conn_t *)proto_handle; 5582 icmp_t *icmp = connp->conn_icmp; 5583 5584 mutex_enter(&icmp->icmp_recv_lock); 5585 connp->conn_flow_cntrld = B_FALSE; 5586 mutex_exit(&icmp->icmp_recv_lock); 5587 } 5588 5589 int 5590 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5591 int mode, int32_t *rvalp, cred_t *cr) 5592 { 5593 conn_t *connp = (conn_t *)proto_handle; 5594 int error; 5595 5596 /* All Solaris components should pass a cred for this operation. */ 5597 ASSERT(cr != NULL); 5598 5599 /* 5600 * If we don't have a helper stream then create one. 5601 * ip_create_helper_stream takes care of locking the conn_t, 5602 * so this check for NULL is just a performance optimization. 5603 */ 5604 if (connp->conn_helper_info == NULL) { 5605 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5606 5607 ASSERT(is->is_ldi_ident != NULL); 5608 5609 /* 5610 * Create a helper stream for non-STREAMS socket. 
5611 */ 5612 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5613 if (error != 0) { 5614 ip0dbg(("rawip_ioctl: create of IP helper stream " 5615 "failed %d\n", error)); 5616 return (error); 5617 } 5618 } 5619 5620 switch (cmd) { 5621 case _SIOCSOCKFALLBACK: 5622 case TI_GETPEERNAME: 5623 case TI_GETMYNAME: 5624 #ifdef DEBUG 5625 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5626 " socket", cmd); 5627 #endif 5628 error = EINVAL; 5629 break; 5630 default: 5631 /* 5632 * Pass on to IP using helper stream 5633 */ 5634 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5635 cmd, arg, mode, cr, rvalp); 5636 break; 5637 } 5638 return (error); 5639 } 5640 5641 int 5642 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5643 cred_t *cr) 5644 { 5645 sin6_t *sin6; 5646 sin_t *sin = NULL; 5647 uint_t srcid; 5648 conn_t *connp = (conn_t *)proto_handle; 5649 icmp_t *icmp = connp->conn_icmp; 5650 int error = 0; 5651 icmp_stack_t *is = icmp->icmp_is; 5652 pid_t pid = curproc->p_pid; 5653 ip_xmit_attr_t *ixa; 5654 5655 ASSERT(DB_TYPE(mp) == M_DATA); 5656 5657 /* All Solaris components should pass a cred for this operation. */ 5658 ASSERT(cr != NULL); 5659 5660 /* do an implicit bind if necessary */ 5661 if (icmp->icmp_state == TS_UNBND) { 5662 error = rawip_implicit_bind(connp); 5663 /* 5664 * We could be racing with an actual bind, in which case 5665 * we would see EPROTO. We cross our fingers and try 5666 * to connect. 
5667 */ 5668 if (!(error == 0 || error == EPROTO)) { 5669 freemsg(mp); 5670 return (error); 5671 } 5672 } 5673 5674 /* Protocol 255 contains full IP headers */ 5675 /* Read without holding lock */ 5676 if (icmp->icmp_hdrincl) { 5677 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5678 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5679 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5680 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5681 freemsg(mp); 5682 return (EINVAL); 5683 } 5684 } 5685 error = icmp_output_hdrincl(connp, mp, cr, pid); 5686 if (is->is_sendto_ignerr) 5687 return (0); 5688 else 5689 return (error); 5690 } 5691 5692 /* Connected? */ 5693 if (msg->msg_name == NULL) { 5694 if (icmp->icmp_state != TS_DATA_XFER) { 5695 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5696 return (EDESTADDRREQ); 5697 } 5698 if (msg->msg_controllen != 0) { 5699 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5700 NULL, msg, cr, pid); 5701 } else { 5702 error = icmp_output_connected(connp, mp, cr, pid); 5703 } 5704 if (is->is_sendto_ignerr) 5705 return (0); 5706 else 5707 return (error); 5708 } 5709 if (icmp->icmp_state == TS_DATA_XFER) { 5710 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5711 return (EISCONN); 5712 } 5713 error = proto_verify_ip_addr(connp->conn_family, 5714 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5715 if (error != 0) { 5716 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5717 return (error); 5718 } 5719 switch (connp->conn_family) { 5720 case AF_INET6: 5721 sin6 = (sin6_t *)msg->msg_name; 5722 5723 /* No support for mapped addresses on raw sockets */ 5724 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5725 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5726 return (EADDRNOTAVAIL); 5727 } 5728 srcid = sin6->__sin6_src_id; 5729 5730 /* 5731 * If the local address is a mapped address return 5732 * an error. 
5733 * It would be possible to send an IPv6 packet but the 5734 * response would never make it back to the application 5735 * since it is bound to a mapped address. 5736 */ 5737 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5738 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5739 return (EADDRNOTAVAIL); 5740 } 5741 5742 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5743 sin6->sin6_addr = ipv6_loopback; 5744 5745 /* 5746 * We have to allocate an ip_xmit_attr_t before we grab 5747 * conn_lock and we need to hold conn_lock once we've check 5748 * conn_same_as_last_v6 to handle concurrent send* calls on a 5749 * socket. 5750 */ 5751 if (msg->msg_controllen == 0) { 5752 ixa = conn_get_ixa(connp, B_FALSE); 5753 if (ixa == NULL) { 5754 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5755 return (ENOMEM); 5756 } 5757 } else { 5758 ixa = NULL; 5759 } 5760 mutex_enter(&connp->conn_lock); 5761 if (icmp->icmp_delayed_error != 0) { 5762 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5763 5764 error = icmp->icmp_delayed_error; 5765 icmp->icmp_delayed_error = 0; 5766 5767 /* Compare IP address and family */ 5768 5769 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5770 &sin2->sin6_addr) && 5771 sin6->sin6_family == sin2->sin6_family) { 5772 mutex_exit(&connp->conn_lock); 5773 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5774 if (ixa != NULL) 5775 ixa_refrele(ixa); 5776 return (error); 5777 } 5778 } 5779 if (msg->msg_controllen != 0) { 5780 mutex_exit(&connp->conn_lock); 5781 ASSERT(ixa == NULL); 5782 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5783 NULL, msg, cr, pid); 5784 } else if (conn_same_as_last_v6(connp, sin6) && 5785 connp->conn_lastsrcid == srcid && 5786 ipsec_outbound_policy_current(ixa)) { 5787 /* icmp_output_lastdst drops conn_lock */ 5788 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5789 } else { 5790 /* icmp_output_newdst drops conn_lock */ 5791 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5792 pid, ixa); 5793 } 5794 
ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5795 if (is->is_sendto_ignerr) 5796 return (0); 5797 else 5798 return (error); 5799 case AF_INET: 5800 sin = (sin_t *)msg->msg_name; 5801 5802 if (sin->sin_addr.s_addr == INADDR_ANY) 5803 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5804 5805 /* 5806 * We have to allocate an ip_xmit_attr_t before we grab 5807 * conn_lock and we need to hold conn_lock once we've check 5808 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5809 */ 5810 if (msg->msg_controllen == 0) { 5811 ixa = conn_get_ixa(connp, B_FALSE); 5812 if (ixa == NULL) { 5813 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5814 return (ENOMEM); 5815 } 5816 } else { 5817 ixa = NULL; 5818 } 5819 mutex_enter(&connp->conn_lock); 5820 if (icmp->icmp_delayed_error != 0) { 5821 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5822 5823 error = icmp->icmp_delayed_error; 5824 icmp->icmp_delayed_error = 0; 5825 5826 /* Compare IP address */ 5827 5828 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5829 mutex_exit(&connp->conn_lock); 5830 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5831 if (ixa != NULL) 5832 ixa_refrele(ixa); 5833 return (error); 5834 } 5835 } 5836 5837 if (msg->msg_controllen != 0) { 5838 mutex_exit(&connp->conn_lock); 5839 ASSERT(ixa == NULL); 5840 error = icmp_output_ancillary(connp, sin, NULL, mp, 5841 NULL, msg, cr, pid); 5842 } else if (conn_same_as_last_v4(connp, sin) && 5843 ipsec_outbound_policy_current(ixa)) { 5844 /* icmp_output_lastdst drops conn_lock */ 5845 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5846 } else { 5847 /* icmp_output_newdst drops conn_lock */ 5848 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5849 pid, ixa); 5850 } 5851 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5852 if (is->is_sendto_ignerr) 5853 return (0); 5854 else 5855 return (error); 5856 default: 5857 return (EINVAL); 5858 } 5859 } 5860 5861 sock_downcalls_t sock_rawip_downcalls = { 5862 rawip_activate, 5863 rawip_accept, 5864 
rawip_bind, 5865 rawip_listen, 5866 rawip_connect, 5867 rawip_getpeername, 5868 rawip_getsockname, 5869 rawip_getsockopt, 5870 rawip_setsockopt, 5871 rawip_send, 5872 NULL, 5873 NULL, 5874 NULL, 5875 rawip_shutdown, 5876 rawip_clr_flowctrl, 5877 rawip_ioctl, 5878 rawip_close 5879 }; 5880