1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. 25 * Copyright (c) 2018, Joyent, Inc. 26 * Copyright 2024 Oxide Computer Company 27 */ 28 /* Copyright (c) 1990 Mentat Inc. 
*/ 29 30 #include <sys/types.h> 31 #include <sys/stream.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #include <sys/strsun.h> 35 #define _SUN_TPI_VERSION 2 36 #include <sys/tihdr.h> 37 #include <sys/timod.h> 38 #include <sys/ddi.h> 39 #include <sys/sunddi.h> 40 #include <sys/strsubr.h> 41 #include <sys/suntpi.h> 42 #include <sys/xti_inet.h> 43 #include <sys/cmn_err.h> 44 #include <sys/kmem.h> 45 #include <sys/cred.h> 46 #include <sys/policy.h> 47 #include <sys/priv.h> 48 #include <sys/ucred.h> 49 #include <sys/zone.h> 50 51 #include <sys/sockio.h> 52 #include <sys/socket.h> 53 #include <sys/socketvar.h> 54 #include <sys/vtrace.h> 55 #include <sys/sdt.h> 56 #include <sys/debug.h> 57 #include <sys/isa_defs.h> 58 #include <sys/random.h> 59 #include <netinet/in.h> 60 #include <netinet/ip6.h> 61 #include <netinet/icmp6.h> 62 #include <netinet/udp.h> 63 64 #include <inet/common.h> 65 #include <inet/ip.h> 66 #include <inet/ip_impl.h> 67 #include <inet/ipsec_impl.h> 68 #include <inet/ip6.h> 69 #include <inet/ip_ire.h> 70 #include <inet/ip_if.h> 71 #include <inet/ip_multi.h> 72 #include <inet/ip_ndp.h> 73 #include <inet/proto_set.h> 74 #include <inet/mib2.h> 75 #include <inet/nd.h> 76 #include <inet/optcom.h> 77 #include <inet/snmpcom.h> 78 #include <inet/kstatcom.h> 79 #include <inet/ipclassifier.h> 80 81 #include <sys/tsol/label.h> 82 #include <sys/tsol/tnet.h> 83 84 #include <inet/rawip_impl.h> 85 86 #include <sys/disp.h> 87 88 /* 89 * Synchronization notes: 90 * 91 * RAWIP is MT and uses the usual kernel synchronization primitives. We use 92 * conn_lock to protect the icmp_t. 93 * 94 * Plumbing notes: 95 * ICMP is always a device driver. For compatibility with mibopen() code 96 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough 97 * dummy module. 
 */

/*
 * Forward declarations.  The icmp_* routines are the TPI/STREAMS side of
 * the raw IP provider; the rawip_* routines are the helpers shared with
 * the socket module interface.
 */
static void	icmp_addr_req(queue_t *q, mblk_t *mp);
static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void	icmp_bind_proto(icmp_t *icmp);
static int	icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void	icmp_capability_req(queue_t *q, mblk_t *mp);
static int	icmp_close(queue_t *q, int flags, cred_t *);
static void	icmp_close_free(conn_t *);
static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void	icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void	icmp_info_req(queue_t *q, mblk_t *mp);
static void	icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t	*icmp_open(int family, cred_t *credp, int *err, int flags);
static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
int		icmp_opt_set(conn_t *connp, uint_t optset_context,
    int level, int name, uint_t inlen,
    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr);
int		icmp_opt_get(conn_t *connp, int level, int name,
    uchar_t *ptr);
static int	icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t	*icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t	*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr, int len);
static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static int	icmp_wput(queue_t *q, mblk_t *mp);
static int	icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void	icmp_wput_other(queue_t *q, mblk_t *mp);
static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void	icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

/* Per-netstack setup and teardown hooks */
static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rawip_stack_fini(netstackid_t stackid, void *arg);

/* Per-netstack kstat support */
static void	*rawip_kstat_init(netstackid_t stackid);
static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int	rawip_kstat_update(kstat_t *kp, int rw);
static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t	*rawip_do_open(int, cred_t *, int *, int);
static void	rawip_do_close(conn_t *);
static int	rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int	rawip_do_unbind(conn_t *);
static int	rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

/* Not static: declared without file scope, unlike the helpers above */
int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);
int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
    socklen_t *, cred_t *);

/*
 * STREAMS module information referenced by every qinit structure of this
 * driver.
 * NOTE(review): field order is presumably that of module_info(9S) - module
 * id, module name, min packet size, max packet size, high water mark,
 * low water mark; confirm against <sys/stream.h>.
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};

/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
176 */ 177 static struct qinit icmprinitv4 = { 178 NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info 179 }; 180 181 static struct qinit icmprinitv6 = { 182 NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info 183 }; 184 185 static struct qinit icmpwinit = { 186 icmp_wput, ip_wsrv, NULL, NULL, NULL, &icmp_mod_info 187 }; 188 189 /* ICMP entry point during fallback */ 190 static struct qinit icmp_fallback_sock_winit = { 191 icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info 192 }; 193 194 /* For AF_INET aka /dev/icmp */ 195 struct streamtab icmpinfov4 = { 196 &icmprinitv4, &icmpwinit 197 }; 198 199 /* For AF_INET6 aka /dev/icmp6 */ 200 struct streamtab icmpinfov6 = { 201 &icmprinitv6, &icmpwinit 202 }; 203 204 /* Default structure copied into T_INFO_ACK messages */ 205 static struct T_info_ack icmp_g_t_info_ack = { 206 T_INFO_ACK, 207 IP_MAXPACKET, /* TSDU_size. icmp allows maximum size messages. */ 208 T_INVALID, /* ETSDU_size. icmp does not support expedited data. */ 209 T_INVALID, /* CDATA_size. icmp does not support connect data. */ 210 T_INVALID, /* DDATA_size. icmp does not support disconnect data. */ 211 0, /* ADDR_size - filled in later. */ 212 0, /* OPT_size - not initialized here */ 213 IP_MAXPACKET, /* TIDU_size. icmp allows maximum size messages. */ 214 T_CLTS, /* SERV_type. icmp supports connection-less. */ 215 TS_UNBND, /* CURRENT_state. This is set from icmp_state. 
*/ 216 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 217 }; 218 219 static int 220 icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo, 221 const char *ifname, const void *pval, uint_t flags) 222 { 223 return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl, 224 stack, cr, pinfo, ifname, pval, flags)); 225 } 226 227 static int 228 icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname, 229 void *val, uint_t psize, uint_t flags) 230 { 231 return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack, 232 pinfo, ifname, val, psize, flags)); 233 } 234 235 /* 236 * All of these are alterable, within the min/max values given, at run time. 237 * 238 * Note: All those tunables which do not start with "icmp_" are Committed and 239 * therefore are public. See PSARC 2010/080. 240 */ 241 static mod_prop_info_t icmp_propinfo_tbl[] = { 242 /* tunable - 0 */ 243 { "_wroff_extra", MOD_PROTO_RAWIP, 244 mod_set_uint32, mod_get_uint32, 245 {0, 128, 32}, {32} }, 246 247 { "_ipv4_ttl", MOD_PROTO_RAWIP, 248 mod_set_uint32, mod_get_uint32, 249 {1, 255, 255}, {255} }, 250 251 { "_ipv6_hoplimit", MOD_PROTO_RAWIP, 252 mod_set_uint32, mod_get_uint32, 253 {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS}, 254 {IPV6_DEFAULT_HOPS} }, 255 256 { "_bsd_compat", MOD_PROTO_RAWIP, 257 mod_set_boolean, mod_get_boolean, 258 {B_TRUE}, {B_TRUE} }, 259 260 { "send_buf", MOD_PROTO_RAWIP, 261 icmp_set_buf_prop, icmp_get_buf_prop, 262 {4096, 65536, 8192}, {8192} }, 263 264 { "_xmit_lowat", MOD_PROTO_RAWIP, 265 mod_set_uint32, mod_get_uint32, 266 {0, 65536, 1024}, {1024} }, 267 268 { "recv_buf", MOD_PROTO_RAWIP, 269 icmp_set_buf_prop, icmp_get_buf_prop, 270 {4096, 65536, 8192}, {8192} }, 271 272 { "max_buf", MOD_PROTO_RAWIP, 273 mod_set_uint32, mod_get_uint32, 274 {65536, ULP_MAX_BUF, 256*1024}, {256*1024} }, 275 276 { "_pmtu_discovery", MOD_PROTO_RAWIP, 277 mod_set_boolean, mod_get_boolean, 278 {B_FALSE}, {B_FALSE} }, 279 280 { "_sendto_ignerr", MOD_PROTO_RAWIP, 281 
mod_set_boolean, mod_get_boolean, 282 {B_FALSE}, {B_FALSE} }, 283 284 { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} }, 285 286 { NULL, 0, NULL, NULL, {0}, {0} } 287 }; 288 289 #define is_wroff_extra is_propinfo_tbl[0].prop_cur_uval 290 #define is_ipv4_ttl is_propinfo_tbl[1].prop_cur_uval 291 #define is_ipv6_hoplimit is_propinfo_tbl[2].prop_cur_uval 292 #define is_bsd_compat is_propinfo_tbl[3].prop_cur_bval 293 #define is_xmit_hiwat is_propinfo_tbl[4].prop_cur_uval 294 #define is_xmit_lowat is_propinfo_tbl[5].prop_cur_uval 295 #define is_recv_hiwat is_propinfo_tbl[6].prop_cur_uval 296 #define is_max_buf is_propinfo_tbl[7].prop_cur_uval 297 #define is_pmtu_discovery is_propinfo_tbl[8].prop_cur_bval 298 #define is_sendto_ignerr is_propinfo_tbl[9].prop_cur_bval 299 300 typedef union T_primitives *t_primp_t; 301 302 /* 303 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 304 * passed to icmp_wput. 305 * It calls IP to verify the local IP address, and calls IP to insert 306 * the conn_t in the fanout table. 307 * If everything is ok it then sends the T_BIND_ACK back up. 308 */ 309 static void 310 icmp_tpi_bind(queue_t *q, mblk_t *mp) 311 { 312 int error; 313 struct sockaddr *sa; 314 struct T_bind_req *tbr; 315 socklen_t len; 316 sin_t *sin; 317 sin6_t *sin6; 318 icmp_t *icmp; 319 conn_t *connp = Q_TO_CONN(q); 320 mblk_t *mp1; 321 cred_t *cr; 322 323 /* 324 * All Solaris components should pass a db_credp 325 * for this TPI message, hence we ASSERT. 326 * But in case there is some other M_PROTO that looks 327 * like a TPI message sent by some other kernel 328 * component, we check and return an error. 
329 */ 330 cr = msg_getcred(mp, NULL); 331 ASSERT(cr != NULL); 332 if (cr == NULL) { 333 icmp_err_ack(q, mp, TSYSERR, EINVAL); 334 return; 335 } 336 337 icmp = connp->conn_icmp; 338 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 339 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 340 "icmp_bind: bad req, len %u", 341 (uint_t)(mp->b_wptr - mp->b_rptr)); 342 icmp_err_ack(q, mp, TPROTO, 0); 343 return; 344 } 345 346 if (icmp->icmp_state != TS_UNBND) { 347 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 348 "icmp_bind: bad state, %u", icmp->icmp_state); 349 icmp_err_ack(q, mp, TOUTSTATE, 0); 350 return; 351 } 352 353 /* 354 * Reallocate the message to make sure we have enough room for an 355 * address. 356 */ 357 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 358 if (mp1 == NULL) { 359 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 360 return; 361 } 362 mp = mp1; 363 364 /* Reset the message type in preparation for shipping it back. */ 365 DB_TYPE(mp) = M_PCPROTO; 366 tbr = (struct T_bind_req *)mp->b_rptr; 367 len = tbr->ADDR_length; 368 switch (len) { 369 case 0: /* request for a generic port */ 370 tbr->ADDR_offset = sizeof (struct T_bind_req); 371 if (connp->conn_family == AF_INET) { 372 tbr->ADDR_length = sizeof (sin_t); 373 sin = (sin_t *)&tbr[1]; 374 *sin = sin_null; 375 sin->sin_family = AF_INET; 376 mp->b_wptr = (uchar_t *)&sin[1]; 377 sa = (struct sockaddr *)sin; 378 len = sizeof (sin_t); 379 } else { 380 ASSERT(connp->conn_family == AF_INET6); 381 tbr->ADDR_length = sizeof (sin6_t); 382 sin6 = (sin6_t *)&tbr[1]; 383 *sin6 = sin6_null; 384 sin6->sin6_family = AF_INET6; 385 mp->b_wptr = (uchar_t *)&sin6[1]; 386 sa = (struct sockaddr *)sin6; 387 len = sizeof (sin6_t); 388 } 389 break; 390 391 case sizeof (sin_t): /* Complete IPv4 address */ 392 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 393 sizeof (sin_t)); 394 break; 395 396 case sizeof (sin6_t): /* Complete IPv6 address */ 397 sa = (struct sockaddr *)mi_offset_param(mp, 398 
tbr->ADDR_offset, sizeof (sin6_t)); 399 break; 400 401 default: 402 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 403 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 404 icmp_err_ack(q, mp, TBADADDR, 0); 405 return; 406 } 407 408 error = rawip_do_bind(connp, sa, len); 409 if (error != 0) { 410 if (error > 0) { 411 icmp_err_ack(q, mp, TSYSERR, error); 412 } else { 413 icmp_err_ack(q, mp, -error, 0); 414 } 415 } else { 416 tbr->PRIM_type = T_BIND_ACK; 417 qreply(q, mp); 418 } 419 } 420 421 static int 422 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 423 { 424 sin_t *sin; 425 sin6_t *sin6; 426 icmp_t *icmp = connp->conn_icmp; 427 int error = 0; 428 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 429 in_port_t lport; /* Network byte order */ 430 ipaddr_t v4src; /* Set if AF_INET */ 431 in6_addr_t v6src; 432 uint_t scopeid = 0; 433 zoneid_t zoneid = IPCL_ZONEID(connp); 434 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 435 436 if (sa == NULL || !OK_32PTR((char *)sa)) { 437 return (EINVAL); 438 } 439 440 switch (len) { 441 case sizeof (sin_t): /* Complete IPv4 address */ 442 sin = (sin_t *)sa; 443 if (sin->sin_family != AF_INET || 444 connp->conn_family != AF_INET) { 445 /* TSYSERR, EAFNOSUPPORT */ 446 return (EAFNOSUPPORT); 447 } 448 v4src = sin->sin_addr.s_addr; 449 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 450 if (v4src != INADDR_ANY) { 451 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 452 B_TRUE); 453 } 454 lport = sin->sin_port; 455 break; 456 case sizeof (sin6_t): /* Complete IPv6 address */ 457 sin6 = (sin6_t *)sa; 458 if (sin6->sin6_family != AF_INET6 || 459 connp->conn_family != AF_INET6) { 460 /* TSYSERR, EAFNOSUPPORT */ 461 return (EAFNOSUPPORT); 462 } 463 /* No support for mapped addresses on raw sockets */ 464 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 465 /* TSYSERR, EADDRNOTAVAIL */ 466 return (EADDRNOTAVAIL); 467 } 468 v6src = sin6->sin6_addr; 469 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 470 if 
(IN6_IS_ADDR_LINKSCOPE(&v6src)) 471 scopeid = sin6->sin6_scope_id; 472 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 473 B_TRUE, scopeid); 474 } 475 lport = sin6->sin6_port; 476 break; 477 478 default: 479 /* TBADADDR */ 480 return (EADDRNOTAVAIL); 481 } 482 483 /* Is the local address a valid unicast, multicast, or broadcast? */ 484 if (laddr_type == IPVL_BAD) 485 return (EADDRNOTAVAIL); 486 487 /* 488 * The state must be TS_UNBND. 489 */ 490 mutex_enter(&connp->conn_lock); 491 if (icmp->icmp_state != TS_UNBND) { 492 mutex_exit(&connp->conn_lock); 493 return (-TOUTSTATE); 494 } 495 496 /* 497 * Copy the source address into our icmp structure. This address 498 * may still be zero; if so, ip will fill in the correct address 499 * each time an outbound packet is passed to it. 500 * If we are binding to a broadcast or multicast address then 501 * we just set the conn_bound_addr since we don't want to use 502 * that as the source address when sending. 503 */ 504 connp->conn_bound_addr_v6 = v6src; 505 connp->conn_laddr_v6 = v6src; 506 if (scopeid != 0) { 507 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 508 connp->conn_ixa->ixa_scopeid = scopeid; 509 connp->conn_incoming_ifindex = scopeid; 510 } else { 511 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 512 connp->conn_incoming_ifindex = connp->conn_bound_if; 513 } 514 515 switch (laddr_type) { 516 case IPVL_UNICAST_UP: 517 case IPVL_UNICAST_DOWN: 518 connp->conn_saddr_v6 = v6src; 519 connp->conn_mcbc_bind = B_FALSE; 520 break; 521 case IPVL_MCAST: 522 case IPVL_BCAST: 523 /* ip_set_destination will pick a source address later */ 524 connp->conn_saddr_v6 = ipv6_all_zeros; 525 connp->conn_mcbc_bind = B_TRUE; 526 break; 527 } 528 529 /* Any errors after this point should use late_error */ 530 531 /* 532 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 533 * with IPPROTO_TCP. 
534 */ 535 connp->conn_lport = lport; 536 connp->conn_fport = 0; 537 538 if (connp->conn_family == AF_INET) { 539 ASSERT(connp->conn_ipversion == IPV4_VERSION); 540 } else { 541 ASSERT(connp->conn_ipversion == IPV6_VERSION); 542 } 543 544 icmp->icmp_state = TS_IDLE; 545 546 /* 547 * We create an initial header template here to make a subsequent 548 * sendto have a starting point. Since conn_last_dst is zero the 549 * first sendto will always follow the 'dst changed' code path. 550 * Note that we defer massaging options and the related checksum 551 * adjustment until we have a destination address. 552 */ 553 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 554 &connp->conn_faddr_v6, connp->conn_flowinfo); 555 if (error != 0) { 556 mutex_exit(&connp->conn_lock); 557 goto late_error; 558 } 559 /* Just in case */ 560 connp->conn_faddr_v6 = ipv6_all_zeros; 561 connp->conn_v6lastdst = ipv6_all_zeros; 562 mutex_exit(&connp->conn_lock); 563 564 error = ip_laddr_fanout_insert(connp); 565 if (error != 0) 566 goto late_error; 567 568 /* Bind succeeded */ 569 return (0); 570 571 late_error: 572 mutex_enter(&connp->conn_lock); 573 connp->conn_saddr_v6 = ipv6_all_zeros; 574 connp->conn_bound_addr_v6 = ipv6_all_zeros; 575 connp->conn_laddr_v6 = ipv6_all_zeros; 576 if (scopeid != 0) { 577 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 578 connp->conn_incoming_ifindex = connp->conn_bound_if; 579 } 580 icmp->icmp_state = TS_UNBND; 581 connp->conn_v6lastdst = ipv6_all_zeros; 582 connp->conn_lport = 0; 583 584 /* Restore the header that was built above - different source address */ 585 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 586 &connp->conn_faddr_v6, connp->conn_flowinfo); 587 mutex_exit(&connp->conn_lock); 588 return (error); 589 } 590 591 /* 592 * Tell IP to just bind to the protocol. 
593 */ 594 static void 595 icmp_bind_proto(icmp_t *icmp) 596 { 597 conn_t *connp = icmp->icmp_connp; 598 599 mutex_enter(&connp->conn_lock); 600 connp->conn_saddr_v6 = ipv6_all_zeros; 601 connp->conn_laddr_v6 = ipv6_all_zeros; 602 connp->conn_faddr_v6 = ipv6_all_zeros; 603 connp->conn_v6lastdst = ipv6_all_zeros; 604 mutex_exit(&connp->conn_lock); 605 606 (void) ip_laddr_fanout_insert(connp); 607 } 608 609 /* 610 * This routine handles each T_CONN_REQ message passed to icmp. It 611 * associates a default destination address with the stream. 612 * 613 * After various error checks are completed, icmp_connect() lays 614 * the target address and port into the composite header template. 615 * Then we ask IP for information, including a source address if we didn't 616 * already have one. Finally we send up the T_OK_ACK reply message. 617 */ 618 static void 619 icmp_tpi_connect(queue_t *q, mblk_t *mp) 620 { 621 conn_t *connp = Q_TO_CONN(q); 622 struct T_conn_req *tcr; 623 struct sockaddr *sa; 624 socklen_t len; 625 int error; 626 cred_t *cr; 627 pid_t pid; 628 /* 629 * All Solaris components should pass a db_credp 630 * for this TPI message, hence we ASSERT. 631 * But in case there is some other M_PROTO that looks 632 * like a TPI message sent by some other kernel 633 * component, we check and return an error. 
634 */ 635 cr = msg_getcred(mp, &pid); 636 ASSERT(cr != NULL); 637 if (cr == NULL) { 638 icmp_err_ack(q, mp, TSYSERR, EINVAL); 639 return; 640 } 641 642 tcr = (struct T_conn_req *)mp->b_rptr; 643 /* Sanity checks */ 644 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 645 icmp_err_ack(q, mp, TPROTO, 0); 646 return; 647 } 648 649 if (tcr->OPT_length != 0) { 650 icmp_err_ack(q, mp, TBADOPT, 0); 651 return; 652 } 653 654 len = tcr->DEST_length; 655 656 switch (len) { 657 default: 658 icmp_err_ack(q, mp, TBADADDR, 0); 659 return; 660 case sizeof (sin_t): 661 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 662 sizeof (sin_t)); 663 break; 664 case sizeof (sin6_t): 665 sa = (struct sockaddr *)mi_offset_param(mp, 666 tcr->DEST_offset, sizeof (sin6_t)); 667 break; 668 } 669 670 error = proto_verify_ip_addr(connp->conn_family, sa, len); 671 if (error != 0) { 672 icmp_err_ack(q, mp, TSYSERR, error); 673 return; 674 } 675 676 error = rawip_do_connect(connp, sa, len, cr, pid); 677 if (error != 0) { 678 if (error < 0) { 679 icmp_err_ack(q, mp, -error, 0); 680 } else { 681 icmp_err_ack(q, mp, 0, error); 682 } 683 } else { 684 mblk_t *mp1; 685 686 /* 687 * We have to send a connection confirmation to 688 * keep TLI happy. 689 */ 690 if (connp->conn_family == AF_INET) { 691 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 692 sizeof (sin_t), NULL, 0); 693 } else { 694 ASSERT(connp->conn_family == AF_INET6); 695 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 696 sizeof (sin6_t), NULL, 0); 697 } 698 if (mp1 == NULL) { 699 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 700 return; 701 } 702 703 /* 704 * Send ok_ack for T_CONN_REQ 705 */ 706 mp = mi_tpi_ok_ack_alloc(mp); 707 if (mp == NULL) { 708 /* Unable to reuse the T_CONN_REQ for the ack. 
*/ 709 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 710 return; 711 } 712 putnext(connp->conn_rq, mp); 713 putnext(connp->conn_rq, mp1); 714 } 715 } 716 717 static int 718 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 719 cred_t *cr, pid_t pid) 720 { 721 icmp_t *icmp; 722 sin_t *sin; 723 sin6_t *sin6; 724 int error; 725 uint16_t dstport; 726 ipaddr_t v4dst; 727 in6_addr_t v6dst; 728 uint32_t flowinfo; 729 ip_xmit_attr_t *ixa; 730 ip_xmit_attr_t *oldixa; 731 uint_t scopeid = 0; 732 uint_t srcid = 0; 733 in6_addr_t v6src = connp->conn_saddr_v6; 734 735 icmp = connp->conn_icmp; 736 737 if (sa == NULL || !OK_32PTR((char *)sa)) { 738 return (EINVAL); 739 } 740 741 ASSERT(sa != NULL && len != 0); 742 sin = NULL; 743 sin6 = NULL; 744 dstport = 0; 745 flowinfo = 0; 746 v4dst = INADDR_ANY; 747 748 /* 749 * Determine packet type based on type of address passed in 750 * the request should contain an IPv4 or IPv6 address. 751 * Make sure that address family matches the type of 752 * family of the address passed down. 753 */ 754 switch (len) { 755 case sizeof (sin_t): 756 sin = (sin_t *)sa; 757 758 v4dst = sin->sin_addr.s_addr; 759 dstport = sin->sin_port; 760 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 761 ASSERT(connp->conn_ipversion == IPV4_VERSION); 762 break; 763 764 case sizeof (sin6_t): 765 sin6 = (sin6_t *)sa; 766 767 /* No support for mapped addresses on raw sockets */ 768 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 769 return (EADDRNOTAVAIL); 770 } 771 v6dst = sin6->sin6_addr; 772 dstport = sin6->sin6_port; 773 ASSERT(connp->conn_ipversion == IPV6_VERSION); 774 flowinfo = sin6->sin6_flowinfo; 775 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 776 scopeid = sin6->sin6_scope_id; 777 srcid = sin6->__sin6_src_id; 778 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 779 /* Due to check above, we know sin6_addr is v6-only. 
*/ 780 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 781 B_FALSE, connp->conn_netstack)) { 782 /* Mismatch - v6src would be v4mapped. */ 783 return (EADDRNOTAVAIL); 784 } 785 } 786 break; 787 } 788 789 /* 790 * If there is a different thread using conn_ixa then we get a new 791 * copy and cut the old one loose from conn_ixa. Otherwise we use 792 * conn_ixa and prevent any other thread from using/changing it. 793 * Once connect() is done other threads can use conn_ixa since the 794 * refcnt will be back at one. 795 * We defer updating conn_ixa until later to handle any concurrent 796 * conn_ixa_cleanup thread. 797 */ 798 ixa = conn_get_ixa(connp, B_FALSE); 799 if (ixa == NULL) 800 return (ENOMEM); 801 802 mutex_enter(&connp->conn_lock); 803 /* 804 * This icmp_t must have bound already before doing a connect. 805 * Reject if a connect is in progress (we drop conn_lock during 806 * rawip_do_connect). 807 */ 808 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 809 mutex_exit(&connp->conn_lock); 810 ixa_refrele(ixa); 811 return (-TOUTSTATE); 812 } 813 814 if (icmp->icmp_state == TS_DATA_XFER) { 815 /* Already connected - clear out state */ 816 if (connp->conn_mcbc_bind) 817 connp->conn_saddr_v6 = ipv6_all_zeros; 818 else 819 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 820 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 821 connp->conn_faddr_v6 = ipv6_all_zeros; 822 icmp->icmp_state = TS_IDLE; 823 } 824 825 /* 826 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 827 * with IPPROTO_TCP. 828 */ 829 connp->conn_fport = dstport; 830 if (connp->conn_ipversion == IPV4_VERSION) { 831 /* 832 * Interpret a zero destination to mean loopback. 833 * Update the T_CONN_REQ (sin/sin6) since it is used to 834 * generate the T_CONN_CON. 
835 */ 836 if (v4dst == INADDR_ANY) { 837 v4dst = htonl(INADDR_LOOPBACK); 838 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 839 ASSERT(connp->conn_family == AF_INET); 840 sin->sin_addr.s_addr = v4dst; 841 } 842 connp->conn_faddr_v6 = v6dst; 843 connp->conn_flowinfo = 0; 844 } else { 845 ASSERT(connp->conn_ipversion == IPV6_VERSION); 846 /* 847 * Interpret a zero destination to mean loopback. 848 * Update the T_CONN_REQ (sin/sin6) since it is used to 849 * generate the T_CONN_CON. 850 */ 851 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 852 v6dst = ipv6_loopback; 853 sin6->sin6_addr = v6dst; 854 } 855 connp->conn_faddr_v6 = v6dst; 856 connp->conn_flowinfo = flowinfo; 857 } 858 859 /* 860 * We update our cred/cpid based on the caller of connect 861 */ 862 if (connp->conn_cred != cr) { 863 crhold(cr); 864 crfree(connp->conn_cred); 865 connp->conn_cred = cr; 866 } 867 connp->conn_cpid = pid; 868 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 869 ixa->ixa_cred = cr; 870 ixa->ixa_cpid = pid; 871 if (is_system_labeled()) { 872 /* We need to restart with a label based on the cred */ 873 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 874 } 875 876 if (scopeid != 0) { 877 ixa->ixa_flags |= IXAF_SCOPEID_SET; 878 ixa->ixa_scopeid = scopeid; 879 connp->conn_incoming_ifindex = scopeid; 880 } else { 881 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 882 connp->conn_incoming_ifindex = connp->conn_bound_if; 883 } 884 885 /* 886 * conn_connect will drop conn_lock and reacquire it. 887 * To prevent a send* from messing with this icmp_t while the lock 888 * is dropped we set icmp_state and clear conn_v6lastdst. 889 * That will make all send* fail with EISCONN. 890 */ 891 connp->conn_v6lastdst = ipv6_all_zeros; 892 icmp->icmp_state = TS_WCON_CREQ; 893 894 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 895 mutex_exit(&connp->conn_lock); 896 if (error != 0) 897 goto connect_failed; 898 899 /* 900 * The addresses have been verified. Time to insert in 901 * the correct fanout list. 
902 */ 903 error = ipcl_conn_insert(connp); 904 if (error != 0) 905 goto connect_failed; 906 907 mutex_enter(&connp->conn_lock); 908 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 909 &connp->conn_faddr_v6, connp->conn_flowinfo); 910 if (error != 0) { 911 mutex_exit(&connp->conn_lock); 912 goto connect_failed; 913 } 914 915 icmp->icmp_state = TS_DATA_XFER; 916 /* Record this as the "last" send even though we haven't sent any */ 917 connp->conn_v6lastdst = connp->conn_faddr_v6; 918 connp->conn_lastipversion = connp->conn_ipversion; 919 connp->conn_lastdstport = connp->conn_fport; 920 connp->conn_lastflowinfo = connp->conn_flowinfo; 921 connp->conn_lastscopeid = scopeid; 922 connp->conn_lastsrcid = srcid; 923 /* Also remember a source to use together with lastdst */ 924 connp->conn_v6lastsrc = v6src; 925 926 oldixa = conn_replace_ixa(connp, ixa); 927 mutex_exit(&connp->conn_lock); 928 ixa_refrele(oldixa); 929 930 ixa_refrele(ixa); 931 return (0); 932 933 connect_failed: 934 if (ixa != NULL) 935 ixa_refrele(ixa); 936 mutex_enter(&connp->conn_lock); 937 icmp->icmp_state = TS_IDLE; 938 /* In case the source address was set above */ 939 if (connp->conn_mcbc_bind) 940 connp->conn_saddr_v6 = ipv6_all_zeros; 941 else 942 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 943 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 944 connp->conn_faddr_v6 = ipv6_all_zeros; 945 connp->conn_v6lastdst = ipv6_all_zeros; 946 connp->conn_flowinfo = 0; 947 948 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 949 &connp->conn_faddr_v6, connp->conn_flowinfo); 950 mutex_exit(&connp->conn_lock); 951 return (error); 952 } 953 954 static void 955 rawip_do_close(conn_t *connp) 956 { 957 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 958 959 ip_quiesce_conn(connp); 960 961 if (!IPCL_IS_NONSTR(connp)) { 962 qprocsoff(connp->conn_rq); 963 } 964 965 icmp_close_free(connp); 966 967 /* 968 * Now we are truly single threaded on this stream, and can 969 * delete the things 
hanging off the connp, and finally the connp. 970 * We removed this connp from the fanout list, it cannot be 971 * accessed thru the fanouts, and we already waited for the 972 * conn_ref to drop to 0. We are already in close, so 973 * there cannot be any other thread from the top. qprocsoff 974 * has completed, and service has completed or won't run in 975 * future. 976 */ 977 ASSERT(connp->conn_ref == 1); 978 979 if (!IPCL_IS_NONSTR(connp)) { 980 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 981 } else { 982 ip_free_helper_stream(connp); 983 } 984 985 connp->conn_ref--; 986 ipcl_conn_destroy(connp); 987 } 988 989 /* ARGSUSED */ 990 static int 991 icmp_close(queue_t *q, int flags, cred_t *credp __unused) 992 { 993 conn_t *connp; 994 995 if (flags & SO_FALLBACK) { 996 /* 997 * stream is being closed while in fallback 998 * simply free the resources that were allocated 999 */ 1000 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 1001 qprocsoff(q); 1002 goto done; 1003 } 1004 1005 connp = Q_TO_CONN(q); 1006 (void) rawip_do_close(connp); 1007 done: 1008 q->q_ptr = WR(q)->q_ptr = NULL; 1009 return (0); 1010 } 1011 1012 static void 1013 icmp_close_free(conn_t *connp) 1014 { 1015 icmp_t *icmp = connp->conn_icmp; 1016 1017 if (icmp->icmp_filter != NULL) { 1018 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 1019 icmp->icmp_filter = NULL; 1020 } 1021 1022 /* 1023 * Clear any fields which the kmem_cache constructor clears. 1024 * Only icmp_connp needs to be preserved. 1025 * TBD: We should make this more efficient to avoid clearing 1026 * everything. 1027 */ 1028 ASSERT(icmp->icmp_connp == connp); 1029 bzero(icmp, sizeof (icmp_t)); 1030 icmp->icmp_connp = connp; 1031 } 1032 1033 /* 1034 * This routine handles each T_DISCON_REQ message passed to icmp 1035 * as an indicating that ICMP is no longer connected. This results 1036 * in telling IP to restore the binding to just the local address. 
1037 */ 1038 static int 1039 icmp_do_disconnect(conn_t *connp) 1040 { 1041 icmp_t *icmp = connp->conn_icmp; 1042 int error; 1043 1044 mutex_enter(&connp->conn_lock); 1045 if (icmp->icmp_state != TS_DATA_XFER) { 1046 mutex_exit(&connp->conn_lock); 1047 return (-TOUTSTATE); 1048 } 1049 if (connp->conn_mcbc_bind) 1050 connp->conn_saddr_v6 = ipv6_all_zeros; 1051 else 1052 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1053 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1054 connp->conn_faddr_v6 = ipv6_all_zeros; 1055 icmp->icmp_state = TS_IDLE; 1056 1057 connp->conn_v6lastdst = ipv6_all_zeros; 1058 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1059 &connp->conn_faddr_v6, connp->conn_flowinfo); 1060 mutex_exit(&connp->conn_lock); 1061 if (error != 0) 1062 return (error); 1063 1064 /* 1065 * Tell IP to remove the full binding and revert 1066 * to the local address binding. 1067 */ 1068 return (ip_laddr_fanout_insert(connp)); 1069 } 1070 1071 static void 1072 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1073 { 1074 conn_t *connp = Q_TO_CONN(q); 1075 int error; 1076 1077 /* 1078 * Allocate the largest primitive we need to send back 1079 * T_error_ack is > than T_ok_ack 1080 */ 1081 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1082 if (mp == NULL) { 1083 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1084 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1085 return; 1086 } 1087 1088 error = icmp_do_disconnect(connp); 1089 1090 if (error != 0) { 1091 if (error > 0) { 1092 icmp_err_ack(q, mp, 0, error); 1093 } else { 1094 icmp_err_ack(q, mp, -error, 0); 1095 } 1096 } else { 1097 mp = mi_tpi_ok_ack_alloc(mp); 1098 ASSERT(mp != NULL); 1099 qreply(q, mp); 1100 } 1101 } 1102 1103 static int 1104 icmp_disconnect(conn_t *connp) 1105 { 1106 int error; 1107 1108 connp->conn_dgram_errind = B_FALSE; 1109 1110 error = icmp_do_disconnect(connp); 1111 1112 if (error < 0) 1113 error = proto_tlitosyserr(-error); 1114 return (error); 1115 } 1116 1117 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1118 static void 1119 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1120 { 1121 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1122 qreply(q, mp); 1123 } 1124 1125 /* Shorthand to generate and send TPI error acks to our client */ 1126 static void 1127 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1128 t_scalar_t t_error, int sys_error) 1129 { 1130 struct T_error_ack *teackp; 1131 1132 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1133 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1134 teackp = (struct T_error_ack *)mp->b_rptr; 1135 teackp->ERROR_prim = primitive; 1136 teackp->TLI_error = t_error; 1137 teackp->UNIX_error = sys_error; 1138 qreply(q, mp); 1139 } 1140 } 1141 1142 /* 1143 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1144 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1145 * Assumes that IP has pulled up everything up to and including the ICMP header. 
 */
/* ARGSUSED2 */
static void
icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t		*connp = (conn_t *)arg1;
	icmp_t		*icmp = connp->conn_icmp;
	icmph_t		*icmph;
	ipha_t		*ipha;
	int		iph_hdr_length;
	sin_t		sin;
	mblk_t		*mp1;
	int		error = 0;

	/*
	 * arg1 is the conn_t the error is for; arg2 is unused.  mp is
	 * consumed on every path, either freed here or handed on to
	 * icmp_icmp_error_ipv6().
	 */
	ipha = (ipha_t *)mp->b_rptr;

	ASSERT(OK_32PTR(mp->b_rptr));

	/* Anything that is not IPv4 is handled by the IPv6 variant. */
	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
		icmp_icmp_error_ipv6(connp, mp, ira);
		return;
	}
	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);

	/* Skip past the outer IP and ICMP headers */
	ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha = (ipha_t *)&icmph[1];	/* Inner IP header */

	/* NOTE(review): this value is not read again in this function. */
	iph_hdr_length = IPH_HDR_LENGTH(ipha);

	switch (icmph->icmph_type) {
	case ICMP_DEST_UNREACHABLE:
		switch (icmph->icmph_code) {
		case ICMP_FRAGMENTATION_NEEDED: {
			/* Shadows the outer ipha; points at the template. */
			ipha_t		*ipha;
			ip_xmit_attr_t	*ixa;
			/*
			 * IP has already adjusted the path MTU.
			 * But we need to adjust DF for IPv4.
			 */
			if (connp->conn_ipversion != IPV4_VERSION)
				break;

			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL || ixa->ixa_ire == NULL) {
				/*
				 * Some other thread holds conn_ixa. We will
				 * redo this on the next ICMP too big.
				 */
				if (ixa != NULL)
					ixa_refrele(ixa);
				break;
			}
			(void) ip_get_pmtu(ixa);

			/*
			 * Mirror the ixa's DF setting into the cached IPv4
			 * header template, under conn_lock.
			 */
			mutex_enter(&connp->conn_lock);
			ipha = (ipha_t *)connp->conn_ht_iphc;
			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
				ipha->ipha_fragment_offset_and_flags |=
				    IPH_DF_HTONS;
			} else {
				ipha->ipha_fragment_offset_and_flags &=
				    ~IPH_DF_HTONS;
			}
			mutex_exit(&connp->conn_lock);
			ixa_refrele(ixa);
			break;
		}
		case ICMP_PORT_UNREACHABLE:
		case ICMP_PROTOCOL_UNREACHABLE:
			error = ECONNREFUSED;
			break;
		default:
			/* Transient errors */
			break;
		}
		break;
	default:
		/* Transient errors */
		break;
	}
	/* Nothing to report: drop the ICMP message. */
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The error is attributed to the inner packet's destination. */
	sin = sin_null;
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = ipha->ipha_dst;

	if (IPCL_IS_NONSTR(connp)) {
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			/*
			 * Connected: raise the error through the socket
			 * upcall, but only when it is about our peer.  The
			 * lock is dropped before making the upcall.
			 */
			if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			/* Not connected: remember the error and address. */
			icmp->icmp_delayed_error = error;
			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		/* STREAMS endpoint: send a T_UDERROR_IND upstream. */
		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
		    error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}

/*
 * icmp_icmp_error_ipv6 is called by icmp_icmp_input to process ICMP for IPv6.
 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
 * Assumes that IP has pulled up all the extension headers as well as the
 * ICMPv6 header.
 */
static void
icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h, *outer_ip6h;
	uint16_t	iph_hdr_length;
	uint8_t		*nexthdrp;
	sin6_t		sin6;
	mblk_t		*mp1;
	int		error = 0;
	icmp_t		*icmp = connp->conn_icmp;

	/* mp is consumed (freed) on every path through this function. */
	outer_ip6h = (ip6_t *)mp->b_rptr;
#ifdef DEBUG
	/* Cross-check the header length recorded in the receive attributes. */
	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
	else
		iph_hdr_length = IPV6_HDR_LEN;
	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
#endif
	/* Skip past the outer IP and ICMP headers */
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];

	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
	/* Drop the message if the inner header chain cannot be walked. */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
		freemsg(mp);
		return;
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		switch (icmp6->icmp6_code) {
		case ICMP6_DST_UNREACH_NOPORT:
			error = ECONNREFUSED;
			break;
		case ICMP6_DST_UNREACH_ADMIN:
		case ICMP6_DST_UNREACH_NOROUTE:
		case ICMP6_DST_UNREACH_BEYONDSCOPE:
		case ICMP6_DST_UNREACH_ADDR:
			/* Transient errors */
			break;
		default:
			break;
		}
		break;
	case ICMP6_PACKET_TOO_BIG: {
		struct T_unitdata_ind	*tudi;
		struct T_opthdr		*toh;
		size_t			udi_size;
		mblk_t			*newmp;
		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
		    sizeof (struct ip6_mtuinfo);
		/* Shadows the outer sin6; points into the new message. */
		sin6_t			*sin6;
		struct ip6_mtuinfo	*mtuinfo;

		/*
		 * If the application has requested to receive path mtu
		 * information, send up an empty message containing an
		 * IPV6_PATHMTU ancillary data item.
		 */
		if (!connp->conn_ipv6_recvpathmtu)
			break;

		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
		    opt_length;
		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
			break;
		}

		/*
		 * newmp->b_cont is left to NULL on purpose.  This is an
		 * empty message containing only ancillary data.
		 *
		 * Layout: T_unitdata_ind, then the source sin6_t, then a
		 * T_opthdr followed by the ip6_mtuinfo payload.
		 */
		newmp->b_datap->db_type = M_PROTO;
		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
		newmp->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin6_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
		tudi->OPT_length = opt_length;

		sin6 = (sin6_t *)&tudi[1];
		bzero(sin6, sizeof (sin6_t));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = connp->conn_faddr_v6;

		toh = (struct T_opthdr *)&sin6[1];
		toh->level = IPPROTO_IPV6;
		toh->name = IPV6_PATHMTU;
		toh->len = opt_length;
		toh->status = 0;

		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
		/* NOTE(review): copied as-is from the packet; confirm order. */
		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
		/*
		 * We've consumed everything we need from the original
		 * message.  Free it, then send our empty message.
		 */
		freemsg(mp);
		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
		return;
	}
	case ICMP6_TIME_EXCEEDED:
		/* Transient errors */
		break;
	case ICMP6_PARAM_PROB:
		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
		    (uchar_t *)nexthdrp) {
			error = ECONNREFUSED;
			break;
		}
		break;
	}
	/* Nothing to report: drop the ICMPv6 message. */
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The error is attributed to the inner packet's destination. */
	sin6 = sin6_null;
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = ip6h->ip6_dst;
	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
	if (IPCL_IS_NONSTR(connp)) {
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			/*
			 * Connected: raise the error through the socket
			 * upcall, but only when it is about our peer.  The
			 * lock is dropped before making the upcall.
			 */
			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
			    &connp->conn_faddr_v6)) {
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			/* Not connected: remember the error and address. */
			icmp->icmp_delayed_error = error;
			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		/* STREAMS endpoint: send a T_UDERROR_IND upstream. */
		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
		    NULL, 0, error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}

/*
 * This routine responds to T_ADDR_REQ messages. It is called by icmp_wput.
 * The local address is filled in if endpoint is bound. The remote address
 * is filled in if remote address has been specified ("connected endpoint")
 * (The concept of connected CLTS sockets is alien to published TPI
 * but we support it anyway).
1448 */ 1449 static void 1450 icmp_addr_req(queue_t *q, mblk_t *mp) 1451 { 1452 struct sockaddr *sa; 1453 mblk_t *ackmp; 1454 struct T_addr_ack *taa; 1455 icmp_t *icmp = Q_TO_ICMP(q); 1456 conn_t *connp = icmp->icmp_connp; 1457 uint_t addrlen; 1458 1459 /* Make it large enough for worst case */ 1460 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1461 2 * sizeof (sin6_t), 1); 1462 if (ackmp == NULL) { 1463 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1464 return; 1465 } 1466 taa = (struct T_addr_ack *)ackmp->b_rptr; 1467 1468 bzero(taa, sizeof (struct T_addr_ack)); 1469 ackmp->b_wptr = (uchar_t *)&taa[1]; 1470 1471 taa->PRIM_type = T_ADDR_ACK; 1472 ackmp->b_datap->db_type = M_PCPROTO; 1473 1474 if (connp->conn_family == AF_INET) 1475 addrlen = sizeof (sin_t); 1476 else 1477 addrlen = sizeof (sin6_t); 1478 1479 mutex_enter(&connp->conn_lock); 1480 /* 1481 * Note: Following code assumes 32 bit alignment of basic 1482 * data structures like sin_t and struct T_addr_ack. 1483 */ 1484 if (icmp->icmp_state != TS_UNBND) { 1485 /* 1486 * Fill in local address first 1487 */ 1488 taa->LOCADDR_offset = sizeof (*taa); 1489 taa->LOCADDR_length = addrlen; 1490 sa = (struct sockaddr *)&taa[1]; 1491 (void) conn_getsockname(connp, sa, &addrlen); 1492 ackmp->b_wptr += addrlen; 1493 } 1494 if (icmp->icmp_state == TS_DATA_XFER) { 1495 /* 1496 * connected, fill remote address too 1497 */ 1498 taa->REMADDR_length = addrlen; 1499 /* assumed 32-bit alignment */ 1500 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1501 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1502 (void) conn_getpeername(connp, sa, &addrlen); 1503 ackmp->b_wptr += addrlen; 1504 } 1505 mutex_exit(&connp->conn_lock); 1506 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1507 qreply(q, ackmp); 1508 } 1509 1510 static void 1511 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1512 { 1513 conn_t *connp = icmp->icmp_connp; 1514 1515 *tap = icmp_g_t_info_ack; 1516 1517 if (connp->conn_family 
== AF_INET6) 1518 tap->ADDR_size = sizeof (sin6_t); 1519 else 1520 tap->ADDR_size = sizeof (sin_t); 1521 tap->CURRENT_state = icmp->icmp_state; 1522 tap->OPT_size = icmp_max_optsize; 1523 } 1524 1525 static void 1526 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1527 t_uscalar_t cap_bits1) 1528 { 1529 tcap->CAP_bits1 = 0; 1530 1531 if (cap_bits1 & TC1_INFO) { 1532 icmp_copy_info(&tcap->INFO_ack, icmp); 1533 tcap->CAP_bits1 |= TC1_INFO; 1534 } 1535 } 1536 1537 /* 1538 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1539 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1540 * icmp_g_t_info_ack. The current state of the stream is copied from 1541 * icmp_state. 1542 */ 1543 static void 1544 icmp_capability_req(queue_t *q, mblk_t *mp) 1545 { 1546 icmp_t *icmp = Q_TO_ICMP(q); 1547 t_uscalar_t cap_bits1; 1548 struct T_capability_ack *tcap; 1549 1550 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1551 1552 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1553 mp->b_datap->db_type, T_CAPABILITY_ACK); 1554 if (!mp) 1555 return; 1556 1557 tcap = (struct T_capability_ack *)mp->b_rptr; 1558 1559 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1560 1561 qreply(q, mp); 1562 } 1563 1564 /* 1565 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1566 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1567 * The current state of the stream is copied from icmp_state. 1568 */ 1569 static void 1570 icmp_info_req(queue_t *q, mblk_t *mp) 1571 { 1572 icmp_t *icmp = Q_TO_ICMP(q); 1573 1574 /* Create a T_INFO_ACK message. 
*/ 1575 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1576 T_INFO_ACK); 1577 if (!mp) 1578 return; 1579 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1580 qreply(q, mp); 1581 } 1582 1583 static int 1584 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1585 int family) 1586 { 1587 conn_t *connp; 1588 dev_t conn_dev; 1589 int error; 1590 1591 /* If the stream is already open, return immediately. */ 1592 if (q->q_ptr != NULL) 1593 return (0); 1594 1595 if (sflag == MODOPEN) 1596 return (EINVAL); 1597 1598 /* 1599 * Since ICMP is not used so heavily, allocating from the small 1600 * arena should be sufficient. 1601 */ 1602 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1603 return (EBUSY); 1604 } 1605 1606 if (flag & SO_FALLBACK) { 1607 /* 1608 * Non streams socket needs a stream to fallback to 1609 */ 1610 RD(q)->q_ptr = (void *)conn_dev; 1611 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1612 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1613 qprocson(q); 1614 return (0); 1615 } 1616 1617 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1618 if (connp == NULL) { 1619 ASSERT(error != 0); 1620 inet_minor_free(ip_minor_arena_sa, conn_dev); 1621 return (error); 1622 } 1623 1624 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1625 connp->conn_dev = conn_dev; 1626 connp->conn_minor_arena = ip_minor_arena_sa; 1627 1628 /* 1629 * Initialize the icmp_t structure for this stream. 1630 */ 1631 q->q_ptr = connp; 1632 WR(q)->q_ptr = connp; 1633 connp->conn_rq = q; 1634 connp->conn_wq = WR(q); 1635 1636 WR(q)->q_hiwat = connp->conn_sndbuf; 1637 WR(q)->q_lowat = connp->conn_sndlowat; 1638 1639 qprocson(q); 1640 1641 /* Set the Stream head write offset. 
*/ 1642 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1643 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1644 1645 mutex_enter(&connp->conn_lock); 1646 connp->conn_state_flags &= ~CONN_INCIPIENT; 1647 mutex_exit(&connp->conn_lock); 1648 1649 icmp_bind_proto(connp->conn_icmp); 1650 1651 return (0); 1652 } 1653 1654 /* For /dev/icmp aka AF_INET open */ 1655 static int 1656 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1657 { 1658 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1659 } 1660 1661 /* For /dev/icmp6 aka AF_INET6 open */ 1662 static int 1663 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1664 { 1665 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1666 } 1667 1668 /* 1669 * This is the open routine for icmp. It allocates a icmp_t structure for 1670 * the stream and, on the first open of the module, creates an ND table. 1671 */ 1672 static conn_t * 1673 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1674 { 1675 icmp_t *icmp; 1676 conn_t *connp; 1677 zoneid_t zoneid; 1678 netstack_t *ns; 1679 icmp_stack_t *is; 1680 int len; 1681 boolean_t isv6 = B_FALSE; 1682 1683 *err = secpolicy_net_icmpaccess(credp); 1684 if (*err != 0) 1685 return (NULL); 1686 1687 if (family == AF_INET6) 1688 isv6 = B_TRUE; 1689 1690 ns = netstack_find_by_cred(credp); 1691 ASSERT(ns != NULL); 1692 is = ns->netstack_icmp; 1693 ASSERT(is != NULL); 1694 1695 /* 1696 * For exclusive stacks we set the zoneid to zero 1697 * to make ICMP operate as if in the global zone. 1698 */ 1699 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1700 zoneid = GLOBAL_ZONEID; 1701 else 1702 zoneid = crgetzoneid(credp); 1703 1704 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1705 1706 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1707 icmp = connp->conn_icmp; 1708 1709 /* 1710 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1711 * done by netstack_find_by_cred() 1712 */ 1713 netstack_rele(ns); 1714 1715 /* 1716 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1717 * need to lock anything. 1718 */ 1719 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1720 ASSERT(connp->conn_icmp == icmp); 1721 ASSERT(icmp->icmp_connp == connp); 1722 1723 /* Set the initial state of the stream and the privilege status. */ 1724 icmp->icmp_state = TS_UNBND; 1725 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1726 if (isv6) { 1727 connp->conn_family = AF_INET6; 1728 connp->conn_ipversion = IPV6_VERSION; 1729 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1730 connp->conn_proto = IPPROTO_ICMPV6; 1731 /* May be changed by a SO_PROTOTYPE socket option. */ 1732 connp->conn_proto = IPPROTO_ICMPV6; 1733 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1734 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1735 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1736 len = sizeof (ip6_t); 1737 } else { 1738 connp->conn_family = AF_INET; 1739 connp->conn_ipversion = IPV4_VERSION; 1740 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1741 /* May be changed by a SO_PROTOTYPE socket option. */ 1742 connp->conn_proto = IPPROTO_ICMP; 1743 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1744 connp->conn_default_ttl = is->is_ipv4_ttl; 1745 len = sizeof (ipha_t); 1746 } 1747 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1748 1749 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1750 1751 /* 1752 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1753 * the checksum is provided in the pre-built packet. We clear 1754 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1755 * complete IP header and not to compute the transport checksum. 
1756 */ 1757 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1758 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1759 connp->conn_ixa->ixa_zoneid = zoneid; 1760 1761 connp->conn_zoneid = zoneid; 1762 1763 /* 1764 * If the caller has the process-wide flag set, then default to MAC 1765 * exempt mode. This allows read-down to unlabeled hosts. 1766 */ 1767 if (getpflags(NET_MAC_AWARE, credp) != 0) 1768 connp->conn_mac_mode = CONN_MAC_AWARE; 1769 1770 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1771 1772 icmp->icmp_is = is; 1773 1774 connp->conn_rcvbuf = is->is_recv_hiwat; 1775 connp->conn_sndbuf = is->is_xmit_hiwat; 1776 connp->conn_sndlowat = is->is_xmit_lowat; 1777 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1778 1779 connp->conn_wroff = len + is->is_wroff_extra; 1780 connp->conn_so_type = SOCK_RAW; 1781 1782 connp->conn_recv = icmp_input; 1783 connp->conn_recvicmp = icmp_icmp_input; 1784 crhold(credp); 1785 connp->conn_cred = credp; 1786 connp->conn_cpid = curproc->p_pid; 1787 connp->conn_open_time = ddi_get_lbolt64(); 1788 /* Cache things in ixa without an extra refhold */ 1789 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1790 connp->conn_ixa->ixa_cred = connp->conn_cred; 1791 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1792 if (is_system_labeled()) 1793 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1794 1795 connp->conn_flow_cntrld = B_FALSE; 1796 1797 if (is->is_pmtu_discovery) 1798 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1799 1800 return (connp); 1801 } 1802 1803 /* 1804 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1805 */ 1806 /* ARGSUSED */ 1807 static boolean_t 1808 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1809 { 1810 return (B_TRUE); 1811 } 1812 1813 /* 1814 * This routine gets default values of certain options whose default 1815 * values are maintained by protcol specific code 1816 */ 1817 int 1818 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1819 { 1820 icmp_t *icmp = Q_TO_ICMP(q); 1821 icmp_stack_t *is = icmp->icmp_is; 1822 int *i1 = (int *)ptr; 1823 1824 switch (level) { 1825 case IPPROTO_IP: 1826 switch (name) { 1827 case IP_MULTICAST_TTL: 1828 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1829 return (sizeof (uchar_t)); 1830 case IP_MULTICAST_LOOP: 1831 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1832 return (sizeof (uchar_t)); 1833 } 1834 break; 1835 case IPPROTO_IPV6: 1836 switch (name) { 1837 case IPV6_MULTICAST_HOPS: 1838 *i1 = IP_DEFAULT_MULTICAST_TTL; 1839 return (sizeof (int)); 1840 case IPV6_MULTICAST_LOOP: 1841 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1842 return (sizeof (int)); 1843 case IPV6_UNICAST_HOPS: 1844 *i1 = is->is_ipv6_hoplimit; 1845 return (sizeof (int)); 1846 } 1847 break; 1848 case IPPROTO_ICMPV6: 1849 switch (name) { 1850 case ICMP6_FILTER: 1851 /* Make it look like "pass all" */ 1852 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1853 return (sizeof (icmp6_filter_t)); 1854 } 1855 break; 1856 } 1857 return (-1); 1858 } 1859 1860 /* 1861 * This routine retrieves the current status of socket options. 1862 * It returns the size of the option retrieved, or -1. 
 */
int
icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		*i1 = (int *)ptr;
	conn_opt_arg_t	coas;
	int		retval;

	/* Describe the conn's own transmit state for conn_opt_get(). */
	coas.coa_connp = connp;
	coas.coa_ixa = connp->conn_ixa;
	coas.coa_ipp = &connp->conn_xmit_ipp;
	coas.coa_ancillary = B_FALSE;
	coas.coa_changed = 0;

	/*
	 * We assume that the optcom framework has checked for the set
	 * of levels and names that are supported, hence we don't worry
	 * about rejecting based on that.
	 * First check for ICMP specific handling, then pass to common routine.
	 */
	switch (level) {
	case IPPROTO_IP:
		/*
		 * Only allow IPv4 option processing on IPv4 sockets.
		 */
		if (connp->conn_family != AF_INET)
			return (-1);

		switch (name) {
		case IP_OPTIONS:
		case T_IP_OPTIONS:
			/* Options are passed up with each packet */
			return (0);
		case IP_HDRINCL:
			mutex_enter(&connp->conn_lock);
			*i1 = (int)icmp->icmp_hdrincl;
			mutex_exit(&connp->conn_lock);
			return (sizeof (int));
		}
		break;

	case IPPROTO_IPV6:
		/*
		 * Only allow IPv6 option processing on native IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (-1);

		switch (name) {
		case IPV6_CHECKSUM:
			/*
			 * Return offset or -1 if no checksum offset.
			 * Does not apply to IPPROTO_ICMPV6
			 */
			if (connp->conn_proto == IPPROTO_ICMPV6)
				return (-1);

			mutex_enter(&connp->conn_lock);
			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
			else
				*i1 = -1;
			mutex_exit(&connp->conn_lock);
			return (sizeof (int));
		}
		break;

	case IPPROTO_ICMPV6:
		/*
		 * Only allow IPv6 option processing on native IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (-1);

		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (-1);

		switch (name) {
		case ICMP6_FILTER:
			mutex_enter(&connp->conn_lock);
			if (icmp->icmp_filter == NULL) {
				/* Make it look like "pass all" */
				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
			} else {
				(void) bcopy(icmp->icmp_filter, ptr,
				    sizeof (icmp6_filter_t));
			}
			mutex_exit(&connp->conn_lock);
			return (sizeof (icmp6_filter_t));
		}
	}
	/* Not handled above: defer to the common code, under conn_lock. */
	mutex_enter(&connp->conn_lock);
	retval = conn_opt_get(&coas, level, name, ptr);
	mutex_exit(&connp->conn_lock);
	return (retval);
}

/*
 * This routine retrieves the current status of socket options.
 * It returns the size of the option retrieved, or -1.
 * It is the queue based wrapper around icmp_opt_get().
 */
int
icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
	conn_t	*connp = Q_TO_CONN(q);
	int	err;

	err = icmp_opt_get(connp, level, name, ptr);
	return (err);
}

/*
 * This routine sets socket options.
 *
 * ICMP specific options are validated, and unless checkonly is set applied,
 * here; every request that is not answered directly is then passed on to
 * conn_opt_set().  Returns 0 or an errno.  Changes that require follow-up
 * work by the caller are recorded in coa->coa_changed.  Must be called
 * without conn_lock held.
 */
int
icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
    uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
{
	conn_t		*connp = coa->coa_connp;
	ip_xmit_attr_t	*ixa = coa->coa_ixa;
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		*i1 = (int *)invalp;
	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
	int		error;

	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));

	/*
	 * For fixed length options, no sanity check
	 * of passed in length is done. It is assumed *_optcom_req()
	 * routines do the right thing.
	 */

	switch (level) {
	case SOL_SOCKET:
		switch (name) {
		case SO_PROTOTYPE:
			/*
			 * Any protocol other than ICMP or ICMPv6 needs the
			 * raw access privilege.
			 */
			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
			    secpolicy_net_rawaccess(cr) != 0) {
				return (EACCES);
			}
			if (checkonly)
				break;

			mutex_enter(&connp->conn_lock);
			connp->conn_proto = *i1 & 0xFF;
			ixa->ixa_protocol = connp->conn_proto;
			if ((connp->conn_proto == IPPROTO_RAW ||
			    connp->conn_proto == IPPROTO_IGMP) &&
			    connp->conn_family == AF_INET) {
				icmp->icmp_hdrincl = 1;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else if (connp->conn_proto == IPPROTO_UDP ||
			    connp->conn_proto == IPPROTO_TCP ||
			    connp->conn_proto == IPPROTO_SCTP) {
				/* Used by test applications like psh */
				icmp->icmp_hdrincl = 0;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else {
				icmp->icmp_hdrincl = 0;
				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
			}

			if (connp->conn_family == AF_INET6 &&
			    connp->conn_proto == IPPROTO_ICMPV6) {
				/* Set offset for icmp6_cksum */
				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = 2;
			}
			/* An ICMPv6 filter is dropped for other protocols. */
			if (icmp->icmp_filter != NULL &&
			    connp->conn_proto != IPPROTO_ICMPV6) {
				kmem_free(icmp->icmp_filter,
				    sizeof (icmp6_filter_t));
				icmp->icmp_filter = NULL;
			}
			mutex_exit(&connp->conn_lock);

			coa->coa_changed |= COA_HEADER_CHANGED;
			/*
			 * For SCTP, we don't use icmp_bind_proto() for
			 * raw socket binding.
			 */
			if (connp->conn_proto == IPPROTO_SCTP)
				return (0);

			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
			return (0);

		case SO_SNDBUF:
			/* Only the upper bound is checked here. */
			if (*i1 > is->is_max_buf) {
				return (ENOBUFS);
			}
			break;
		case SO_RCVBUF:
			if (*i1 > is->is_max_buf) {
				return (ENOBUFS);
			}
			break;
		}
		break;

	case IPPROTO_IP:
		/*
		 * Only allow IPv4 option processing on IPv4 sockets.
		 */
		if (connp->conn_family != AF_INET)
			return (EINVAL);

		switch (name) {
		case IP_HDRINCL:
			if (!checkonly) {
				/*
				 * With IP_HDRINCL on, IP is told not to
				 * compute the transport checksum.
				 */
				mutex_enter(&connp->conn_lock);
				icmp->icmp_hdrincl = onoff;
				if (onoff)
					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
				else
					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
				mutex_exit(&connp->conn_lock);
			}
			break;
		}
		break;

	case IPPROTO_IPV6:
		if (connp->conn_family != AF_INET6)
			return (EINVAL);

		switch (name) {
		case IPV6_CHECKSUM:
			/*
			 * Integer offset into the user data of where the
			 * checksum is located.
			 * Offset of -1 disables option.
			 * Does not apply to IPPROTO_ICMPV6.
			 */
			if (connp->conn_proto == IPPROTO_ICMPV6 ||
			    coa->coa_ancillary) {
				return (EINVAL);
			}
			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
				/* Negative or not 16 bit aligned offset */
				return (EINVAL);
			}
			if (checkonly)
				break;

			mutex_enter(&connp->conn_lock);
			if (*i1 == -1) {
				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = 0;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else {
				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = *i1;
				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;

	case IPPROTO_ICMPV6:
		/*
		 * Only allow IPv6 option processing on IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (EINVAL);
		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (EINVAL);

		switch (name) {
		case ICMP6_FILTER:
			/* Note: the length check below is skipped here. */
			if (checkonly)
				break;

			if ((inlen != 0) &&
			    (inlen != sizeof (icmp6_filter_t)))
				return (EINVAL);

			/*
			 * A zero length value removes the filter; otherwise
			 * the filter is allocated on first use and
			 * overwritten with the caller's value.
			 */
			mutex_enter(&connp->conn_lock);
			if (inlen == 0) {
				if (icmp->icmp_filter != NULL) {
					kmem_free(icmp->icmp_filter,
					    sizeof (icmp6_filter_t));
					icmp->icmp_filter = NULL;
				}
			} else {
				if (icmp->icmp_filter == NULL) {
					icmp->icmp_filter = kmem_alloc(
					    sizeof (icmp6_filter_t),
					    KM_NOSLEEP);
					if (icmp->icmp_filter == NULL) {
						mutex_exit(&connp->conn_lock);
						return (ENOBUFS);
					}
				}
				(void) bcopy(invalp, icmp->icmp_filter, inlen);
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;
	}
	/* Everything that did not return above also goes to common code. */
	error = conn_opt_set(coa, level, name, inlen, invalp,
	    checkonly, cr);
	return (error);
}

/*
 * This routine sets socket options.
 */
int
icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		err;
	conn_opt_arg_t	coas, *coa;
	boolean_t	checkonly;
	icmp_stack_t	*is = icmp->icmp_is;

	switch (optset_context) {
	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 * we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 * value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
2198 */ 2199 if (inlen == 0) { 2200 *outlenp = 0; 2201 return (0); 2202 } 2203 break; 2204 case SETFN_OPTCOM_NEGOTIATE: 2205 checkonly = B_FALSE; 2206 break; 2207 case SETFN_UD_NEGOTIATE: 2208 case SETFN_CONN_NEGOTIATE: 2209 checkonly = B_FALSE; 2210 /* 2211 * Negotiating local and "association-related" options 2212 * through T_UNITDATA_REQ. 2213 * 2214 * Following routine can filter out ones we do not 2215 * want to be "set" this way. 2216 */ 2217 if (!icmp_opt_allow_udr_set(level, name)) { 2218 *outlenp = 0; 2219 return (EINVAL); 2220 } 2221 break; 2222 default: 2223 /* 2224 * We should never get here 2225 */ 2226 *outlenp = 0; 2227 return (EINVAL); 2228 } 2229 2230 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 2231 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 2232 2233 if (thisdg_attrs != NULL) { 2234 /* Options from T_UNITDATA_REQ */ 2235 coa = (conn_opt_arg_t *)thisdg_attrs; 2236 ASSERT(coa->coa_connp == connp); 2237 ASSERT(coa->coa_ixa != NULL); 2238 ASSERT(coa->coa_ipp != NULL); 2239 ASSERT(coa->coa_ancillary); 2240 } else { 2241 coa = &coas; 2242 coas.coa_connp = connp; 2243 /* Get a reference on conn_ixa to prevent concurrent mods */ 2244 coas.coa_ixa = conn_get_ixa(connp, B_TRUE); 2245 if (coas.coa_ixa == NULL) { 2246 *outlenp = 0; 2247 return (ENOMEM); 2248 } 2249 coas.coa_ipp = &connp->conn_xmit_ipp; 2250 coas.coa_ancillary = B_FALSE; 2251 coas.coa_changed = 0; 2252 } 2253 2254 err = icmp_do_opt_set(coa, level, name, inlen, invalp, 2255 cr, checkonly); 2256 if (err != 0) { 2257 errout: 2258 if (!coa->coa_ancillary) 2259 ixa_refrele(coa->coa_ixa); 2260 *outlenp = 0; 2261 return (err); 2262 } 2263 2264 /* 2265 * Common case of OK return with outval same as inval. 
2266 */ 2267 if (invalp != outvalp) { 2268 /* don't trust bcopy for identical src/dst */ 2269 (void) bcopy(invalp, outvalp, inlen); 2270 } 2271 *outlenp = inlen; 2272 2273 /* 2274 * If this was not ancillary data, then we rebuild the headers, 2275 * update the IRE/NCE, and IPsec as needed. 2276 * Since the label depends on the destination we go through 2277 * ip_set_destination first. 2278 */ 2279 if (coa->coa_ancillary) { 2280 return (0); 2281 } 2282 2283 if (coa->coa_changed & COA_ROUTE_CHANGED) { 2284 in6_addr_t saddr, faddr, nexthop; 2285 in_port_t fport; 2286 2287 /* 2288 * We clear lastdst to make sure we pick up the change 2289 * next time sending. 2290 * If we are connected we re-cache the information. 2291 * We ignore errors to preserve BSD behavior. 2292 * Note that we don't redo IPsec policy lookup here 2293 * since the final destination (or source) didn't change. 2294 */ 2295 mutex_enter(&connp->conn_lock); 2296 connp->conn_v6lastdst = ipv6_all_zeros; 2297 2298 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, 2299 &connp->conn_faddr_v6, &nexthop); 2300 saddr = connp->conn_saddr_v6; 2301 faddr = connp->conn_faddr_v6; 2302 fport = connp->conn_fport; 2303 mutex_exit(&connp->conn_lock); 2304 2305 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && 2306 !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { 2307 (void) ip_attr_connect(connp, coa->coa_ixa, 2308 &saddr, &faddr, &nexthop, fport, NULL, NULL, 2309 IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 2310 } 2311 } 2312 2313 ixa_refrele(coa->coa_ixa); 2314 2315 if (coa->coa_changed & COA_HEADER_CHANGED) { 2316 /* 2317 * Rebuild the header template if we are connected. 2318 * Otherwise clear conn_v6lastdst so we rebuild the header 2319 * in the data path. 
2320 */ 2321 mutex_enter(&connp->conn_lock); 2322 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 2323 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 2324 err = icmp_build_hdr_template(connp, 2325 &connp->conn_saddr_v6, &connp->conn_faddr_v6, 2326 connp->conn_flowinfo); 2327 if (err != 0) { 2328 mutex_exit(&connp->conn_lock); 2329 return (err); 2330 } 2331 } else { 2332 connp->conn_v6lastdst = ipv6_all_zeros; 2333 } 2334 mutex_exit(&connp->conn_lock); 2335 } 2336 if (coa->coa_changed & COA_RCVBUF_CHANGED) { 2337 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 2338 connp->conn_rcvbuf); 2339 } 2340 if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 2341 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 2342 } 2343 if (coa->coa_changed & COA_WROFF_CHANGED) { 2344 /* Increase wroff if needed */ 2345 uint_t wroff; 2346 2347 mutex_enter(&connp->conn_lock); 2348 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; 2349 if (wroff > connp->conn_wroff) { 2350 connp->conn_wroff = wroff; 2351 mutex_exit(&connp->conn_lock); 2352 (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); 2353 } else { 2354 mutex_exit(&connp->conn_lock); 2355 } 2356 } 2357 if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { 2358 icmp_bind_proto(icmp); 2359 } 2360 return (err); 2361 } 2362 2363 /* This routine sets socket options. */ 2364 int 2365 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 2366 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2367 void *thisdg_attrs, cred_t *cr) 2368 { 2369 conn_t *connp = Q_TO_CONN(q); 2370 int error; 2371 2372 error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, 2373 outlenp, outvalp, thisdg_attrs, cr); 2374 return (error); 2375 } 2376 2377 /* 2378 * Setup IP headers. 2379 * 2380 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, 2381 * but icmp_output_hdrincl restores ipha_protocol once we return. 
2382 */ 2383 mblk_t * 2384 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2385 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2386 mblk_t *data_mp, int *errorp) 2387 { 2388 mblk_t *mp; 2389 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2390 uint_t data_len; 2391 uint32_t cksum; 2392 2393 data_len = msgdsize(data_mp); 2394 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2395 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2396 if (mp == NULL) { 2397 ASSERT(*errorp != 0); 2398 return (NULL); 2399 } 2400 2401 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2402 2403 /* 2404 * If there was a routing option/header then conn_prepend_hdr 2405 * has massaged it and placed the pseudo-header checksum difference 2406 * in the cksum argument. 2407 * 2408 * Prepare for ICMPv6 checksum done in IP. 2409 * 2410 * We make it easy for IP to include our pseudo header 2411 * by putting our length (and any routing header adjustment) 2412 * in the ICMPv6 checksum field. 2413 * The IP source, destination, and length have already been set by 2414 * conn_prepend_hdr. 
2415 */ 2416 cksum += data_len; 2417 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2418 ASSERT(cksum < 0x10000); 2419 2420 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2421 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2422 2423 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2424 } else { 2425 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2426 uint_t cksum_offset = 0; 2427 2428 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2429 2430 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2431 if (connp->conn_proto == IPPROTO_ICMPV6) { 2432 cksum_offset = ixa->ixa_ip_hdr_length + 2433 offsetof(icmp6_t, icmp6_cksum); 2434 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2435 cksum_offset = ixa->ixa_ip_hdr_length + 2436 ixa->ixa_raw_cksum_offset; 2437 } 2438 } 2439 if (cksum_offset != 0) { 2440 uint16_t *ptr; 2441 2442 /* Make sure the checksum fits in the first mblk */ 2443 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2444 mblk_t *mp1; 2445 2446 mp1 = msgpullup(mp, 2447 cksum_offset + sizeof (short)); 2448 freemsg(mp); 2449 if (mp1 == NULL) { 2450 *errorp = ENOMEM; 2451 return (NULL); 2452 } 2453 mp = mp1; 2454 ip6h = (ip6_t *)mp->b_rptr; 2455 } 2456 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2457 *ptr = htons(cksum); 2458 } 2459 } 2460 2461 /* Note that we don't try to update wroff due to ancillary data */ 2462 return (mp); 2463 } 2464 2465 static int 2466 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2467 const in6_addr_t *v6dst, uint32_t flowinfo) 2468 { 2469 int error; 2470 2471 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2472 /* 2473 * We clear lastdst to make sure we don't use the lastdst path 2474 * next time sending since we might not have set v6dst yet. 2475 */ 2476 connp->conn_v6lastdst = ipv6_all_zeros; 2477 2478 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2479 if (error != 0) 2480 return (error); 2481 2482 /* 2483 * Any routing header/option has been massaged. The checksum difference 2484 * is stored in conn_sum. 
2485 */ 2486 return (0); 2487 } 2488 2489 static mblk_t * 2490 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2491 { 2492 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2493 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2494 /* 2495 * fallback has started but messages have not been moved yet 2496 */ 2497 if (icmp->icmp_fallback_queue_head == NULL) { 2498 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2499 icmp->icmp_fallback_queue_head = mp; 2500 icmp->icmp_fallback_queue_tail = mp; 2501 } else { 2502 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2503 icmp->icmp_fallback_queue_tail->b_next = mp; 2504 icmp->icmp_fallback_queue_tail = mp; 2505 } 2506 return (NULL); 2507 } else { 2508 /* 2509 * Fallback completed, let the caller putnext() the mblk. 2510 */ 2511 return (mp); 2512 } 2513 } 2514 2515 /* 2516 * Deliver data to ULP. In case we have a socket, and it's falling back to 2517 * TPI, then we'll queue the mp for later processing. 2518 */ 2519 static void 2520 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2521 { 2522 if (IPCL_IS_NONSTR(connp)) { 2523 icmp_t *icmp = connp->conn_icmp; 2524 int error; 2525 2526 ASSERT(len == msgdsize(mp)); 2527 if ((*connp->conn_upcalls->su_recv) 2528 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2529 mutex_enter(&icmp->icmp_recv_lock); 2530 if (error == ENOSPC) { 2531 /* 2532 * let's confirm while holding the lock 2533 */ 2534 if ((*connp->conn_upcalls->su_recv) 2535 (connp->conn_upper_handle, NULL, 0, 0, 2536 &error, NULL) < 0) { 2537 ASSERT(error == ENOSPC); 2538 if (error == ENOSPC) { 2539 connp->conn_flow_cntrld = 2540 B_TRUE; 2541 } 2542 } 2543 mutex_exit(&icmp->icmp_recv_lock); 2544 } else { 2545 ASSERT(error == EOPNOTSUPP); 2546 mp = icmp_queue_fallback(icmp, mp); 2547 mutex_exit(&icmp->icmp_recv_lock); 2548 if (mp != NULL) 2549 putnext(connp->conn_rq, mp); 2550 } 2551 } 2552 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2553 } else { 2554 putnext(connp->conn_rq, mp); 2555 } 2556 } 2557 2558 /* 2559 * This is 
the inbound data path. 2560 * IP has already pulled up the IP headers and verified alignment 2561 * etc. 2562 */ 2563 /* ARGSUSED2 */ 2564 static void 2565 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2566 { 2567 conn_t *connp = (conn_t *)arg1; 2568 struct T_unitdata_ind *tudi; 2569 uchar_t *rptr; /* Pointer to IP header */ 2570 int ip_hdr_length; 2571 int udi_size; /* Size of T_unitdata_ind */ 2572 int pkt_len; 2573 icmp_t *icmp; 2574 ip_pkt_t ipps; 2575 ip6_t *ip6h; 2576 mblk_t *mp1; 2577 crb_t recv_ancillary; 2578 icmp_stack_t *is; 2579 sin_t *sin; 2580 sin6_t *sin6; 2581 ipha_t *ipha; 2582 2583 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2584 2585 icmp = connp->conn_icmp; 2586 is = icmp->icmp_is; 2587 rptr = mp->b_rptr; 2588 2589 ASSERT(DB_TYPE(mp) == M_DATA); 2590 ASSERT(OK_32PTR(rptr)); 2591 ASSERT(ira->ira_pktlen == msgdsize(mp)); 2592 pkt_len = ira->ira_pktlen; 2593 2594 /* 2595 * Get a snapshot of these and allow other threads to change 2596 * them after that. We need the same recv_ancillary when determining 2597 * the size as when adding the ancillary data items. 2598 */ 2599 mutex_enter(&connp->conn_lock); 2600 recv_ancillary = connp->conn_recv_ancillary; 2601 mutex_exit(&connp->conn_lock); 2602 2603 ip_hdr_length = ira->ira_ip_hdr_length; 2604 ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ 2605 2606 /* Initialize regardless of IP version */ 2607 ipps.ipp_fields = 0; 2608 2609 if (ira->ira_flags & IRAF_IS_IPV4) { 2610 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); 2611 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 2612 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); 2613 2614 ipha = (ipha_t *)mp->b_rptr; 2615 if (recv_ancillary.crb_all != 0) 2616 (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); 2617 2618 /* 2619 * BSD for some reason adjusts ipha_length to exclude the 2620 * IP header length. We do the same. 
2621 */ 2622 if (is->is_bsd_compat) { 2623 ushort_t len; 2624 2625 len = ntohs(ipha->ipha_length); 2626 if (mp->b_datap->db_ref > 1) { 2627 /* 2628 * Allocate a new IP header so that we can 2629 * modify ipha_length. 2630 */ 2631 mblk_t *mp1; 2632 2633 mp1 = allocb(ip_hdr_length, BPRI_MED); 2634 if (mp1 == NULL) { 2635 freemsg(mp); 2636 BUMP_MIB(&is->is_rawip_mib, 2637 rawipInErrors); 2638 return; 2639 } 2640 bcopy(rptr, mp1->b_rptr, ip_hdr_length); 2641 mp->b_rptr = rptr + ip_hdr_length; 2642 rptr = mp1->b_rptr; 2643 ipha = (ipha_t *)rptr; 2644 mp1->b_cont = mp; 2645 mp1->b_wptr = rptr + ip_hdr_length; 2646 mp = mp1; 2647 } 2648 len -= ip_hdr_length; 2649 ipha->ipha_length = htons(len); 2650 } 2651 2652 /* 2653 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 2654 * sockets. This is ensured by icmp_bind and the IP fanout code. 2655 */ 2656 ASSERT(connp->conn_family == AF_INET); 2657 2658 /* 2659 * This is the inbound data path. Packets are passed upstream 2660 * as T_UNITDATA_IND messages with full IPv4 headers still 2661 * attached. 2662 */ 2663 2664 /* 2665 * Normally only send up the source address. 2666 * If any ancillary data items are wanted we add those. 2667 */ 2668 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 2669 if (recv_ancillary.crb_all != 0) { 2670 udi_size += conn_recvancillary_size(connp, 2671 recv_ancillary, ira, mp, &ipps); 2672 } 2673 2674 /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ 2675 mp1 = allocb(udi_size, BPRI_MED); 2676 if (mp1 == NULL) { 2677 freemsg(mp); 2678 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2679 return; 2680 } 2681 mp1->b_cont = mp; 2682 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2683 mp1->b_datap->db_type = M_PROTO; 2684 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2685 tudi->PRIM_type = T_UNITDATA_IND; 2686 tudi->SRC_length = sizeof (sin_t); 2687 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2688 sin = (sin_t *)&tudi[1]; 2689 *sin = sin_null; 2690 sin->sin_family = AF_INET; 2691 sin->sin_addr.s_addr = ipha->ipha_src; 2692 *(uint32_t *)&sin->sin_zero[0] = 0; 2693 *(uint32_t *)&sin->sin_zero[4] = 0; 2694 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + 2695 sizeof (sin_t); 2696 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 2697 tudi->OPT_length = udi_size; 2698 2699 /* 2700 * Add options if IP_RECVIF etc is set 2701 */ 2702 if (udi_size != 0) { 2703 conn_recvancillary_add(connp, recv_ancillary, ira, 2704 &ipps, (uchar_t *)&sin[1], udi_size); 2705 } 2706 goto deliver; 2707 } 2708 2709 ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); 2710 /* 2711 * IPv6 packets can only be received by applications 2712 * that are prepared to receive IPv6 addresses. 2713 * The IP fanout must ensure this. 2714 */ 2715 ASSERT(connp->conn_family == AF_INET6); 2716 2717 /* 2718 * Handle IPv6 packets. We don't pass up the IP headers with the 2719 * payload for IPv6. 2720 */ 2721 2722 ip6h = (ip6_t *)rptr; 2723 if (recv_ancillary.crb_all != 0) { 2724 /* 2725 * Call on ip_find_hdr_v6 which gets individual lenghts of 2726 * extension headers (and pointers to them). 2727 */ 2728 uint8_t nexthdr; 2729 2730 /* We don't care about the length or nextheader. */ 2731 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); 2732 2733 /* 2734 * We do not pass up hop-by-hop options or any other 2735 * extension header as part of the packet. Applications 2736 * that want to see them have to specify IPV6_RECV* socket 2737 * options. 
And conn_recvancillary_size/add explicitly 2738 * drops the TX option from IPV6_HOPOPTS as it does for UDP. 2739 * 2740 * If we had multilevel ICMP sockets, then we'd want to 2741 * modify conn_recvancillary_size/add to 2742 * allow the user to see the label. 2743 */ 2744 } 2745 2746 /* 2747 * Check a filter for ICMPv6 types if needed. 2748 * Verify raw checksums if needed. 2749 */ 2750 mutex_enter(&connp->conn_lock); 2751 if (icmp->icmp_filter != NULL) { 2752 int type; 2753 2754 /* Assumes that IP has done the pullupmsg */ 2755 type = mp->b_rptr[ip_hdr_length]; 2756 2757 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); 2758 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { 2759 mutex_exit(&connp->conn_lock); 2760 freemsg(mp); 2761 return; 2762 } 2763 } 2764 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2765 /* Checksum */ 2766 uint16_t *up; 2767 uint32_t sum; 2768 int remlen; 2769 2770 up = (uint16_t *)&ip6h->ip6_src; 2771 2772 remlen = msgdsize(mp) - ip_hdr_length; 2773 sum = htons(connp->conn_proto + remlen) 2774 + up[0] + up[1] + up[2] + up[3] 2775 + up[4] + up[5] + up[6] + up[7] 2776 + up[8] + up[9] + up[10] + up[11] 2777 + up[12] + up[13] + up[14] + up[15]; 2778 sum = (sum & 0xffff) + (sum >> 16); 2779 sum = IP_CSUM(mp, ip_hdr_length, sum); 2780 if (sum != 0) { 2781 /* IPv6 RAW checksum failed */ 2782 ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); 2783 mutex_exit(&connp->conn_lock); 2784 freemsg(mp); 2785 BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); 2786 return; 2787 } 2788 } 2789 mutex_exit(&connp->conn_lock); 2790 2791 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2792 2793 if (recv_ancillary.crb_all != 0) { 2794 udi_size += conn_recvancillary_size(connp, 2795 recv_ancillary, ira, mp, &ipps); 2796 } 2797 2798 mp1 = allocb(udi_size, BPRI_MED); 2799 if (mp1 == NULL) { 2800 freemsg(mp); 2801 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2802 return; 2803 } 2804 mp1->b_cont = mp; 2805 mp1->b_datap->db_type = M_PROTO; 2806 
tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2807 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2808 tudi->PRIM_type = T_UNITDATA_IND; 2809 tudi->SRC_length = sizeof (sin6_t); 2810 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2811 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2812 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); 2813 tudi->OPT_length = udi_size; 2814 sin6 = (sin6_t *)&tudi[1]; 2815 *sin6 = sin6_null; 2816 sin6->sin6_port = 0; 2817 sin6->sin6_family = AF_INET6; 2818 2819 sin6->sin6_addr = ip6h->ip6_src; 2820 /* No sin6_flowinfo per API */ 2821 sin6->sin6_flowinfo = 0; 2822 /* For link-scope pass up scope id */ 2823 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) 2824 sin6->sin6_scope_id = ira->ira_ruifindex; 2825 else 2826 sin6->sin6_scope_id = 0; 2827 sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 2828 IPCL_ZONEID(connp), is->is_netstack); 2829 2830 if (udi_size != 0) { 2831 conn_recvancillary_add(connp, recv_ancillary, ira, 2832 &ipps, (uchar_t *)&sin6[1], udi_size); 2833 } 2834 2835 /* Skip all the IPv6 headers per API */ 2836 mp->b_rptr += ip_hdr_length; 2837 pkt_len -= ip_hdr_length; 2838 2839 deliver: 2840 BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); 2841 icmp_ulp_recv(connp, mp1, pkt_len); 2842 } 2843 2844 /* 2845 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report 2846 * information that can be changing beneath us. 
2847 */ 2848 mblk_t * 2849 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2850 { 2851 mblk_t *mpdata; 2852 struct opthdr *optp; 2853 conn_t *connp = Q_TO_CONN(q); 2854 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2855 mblk_t *mp2ctl; 2856 2857 /* 2858 * make a copy of the original message 2859 */ 2860 mp2ctl = copymsg(mpctl); 2861 2862 if (mpctl == NULL || 2863 (mpdata = mpctl->b_cont) == NULL) { 2864 freemsg(mpctl); 2865 freemsg(mp2ctl); 2866 return (0); 2867 } 2868 2869 /* fixed length structure for IPv4 and IPv6 counters */ 2870 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2871 optp->level = EXPER_RAWIP; 2872 optp->name = 0; 2873 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2874 sizeof (is->is_rawip_mib)); 2875 optp->len = msgdsize(mpdata); 2876 qreply(q, mpctl); 2877 2878 return (mp2ctl); 2879 } 2880 2881 /* 2882 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2883 * TODO: If this ever actually tries to set anything, it needs to be 2884 * to do the appropriate locking. 2885 */ 2886 /* ARGSUSED */ 2887 int 2888 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2889 uchar_t *ptr, int len) 2890 { 2891 switch (level) { 2892 case EXPER_RAWIP: 2893 return (0); 2894 default: 2895 return (1); 2896 } 2897 } 2898 2899 /* 2900 * This routine creates a T_UDERROR_IND message and passes it upstream. 2901 * The address and options are copied from the T_UNITDATA_REQ message 2902 * passed in mp. This message is freed. 
2903 */ 2904 static void 2905 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2906 { 2907 struct T_unitdata_req *tudr; 2908 mblk_t *mp1; 2909 uchar_t *destaddr; 2910 t_scalar_t destlen; 2911 uchar_t *optaddr; 2912 t_scalar_t optlen; 2913 2914 if ((mp->b_wptr < mp->b_rptr) || 2915 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2916 goto done; 2917 } 2918 tudr = (struct T_unitdata_req *)mp->b_rptr; 2919 destaddr = mp->b_rptr + tudr->DEST_offset; 2920 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2921 destaddr + tudr->DEST_length < mp->b_rptr || 2922 destaddr + tudr->DEST_length > mp->b_wptr) { 2923 goto done; 2924 } 2925 optaddr = mp->b_rptr + tudr->OPT_offset; 2926 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2927 optaddr + tudr->OPT_length < mp->b_rptr || 2928 optaddr + tudr->OPT_length > mp->b_wptr) { 2929 goto done; 2930 } 2931 destlen = tudr->DEST_length; 2932 optlen = tudr->OPT_length; 2933 2934 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2935 (char *)optaddr, optlen, err); 2936 if (mp1 != NULL) 2937 qreply(q, mp1); 2938 2939 done: 2940 freemsg(mp); 2941 } 2942 2943 static int 2944 rawip_do_unbind(conn_t *connp) 2945 { 2946 icmp_t *icmp = connp->conn_icmp; 2947 2948 mutex_enter(&connp->conn_lock); 2949 /* If a bind has not been done, we can't unbind. 
*/ 2950 if (icmp->icmp_state == TS_UNBND) { 2951 mutex_exit(&connp->conn_lock); 2952 return (-TOUTSTATE); 2953 } 2954 connp->conn_saddr_v6 = ipv6_all_zeros; 2955 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2956 connp->conn_laddr_v6 = ipv6_all_zeros; 2957 connp->conn_mcbc_bind = B_FALSE; 2958 connp->conn_lport = 0; 2959 connp->conn_fport = 0; 2960 /* In case we were also connected */ 2961 connp->conn_faddr_v6 = ipv6_all_zeros; 2962 connp->conn_v6lastdst = ipv6_all_zeros; 2963 2964 icmp->icmp_state = TS_UNBND; 2965 2966 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2967 &connp->conn_faddr_v6, connp->conn_flowinfo); 2968 mutex_exit(&connp->conn_lock); 2969 2970 ip_unbind(connp); 2971 return (0); 2972 } 2973 2974 /* 2975 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2976 * After some error checking, the message is passed downstream to ip. 2977 */ 2978 static void 2979 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2980 { 2981 conn_t *connp = Q_TO_CONN(q); 2982 int error; 2983 2984 ASSERT(mp->b_cont == NULL); 2985 error = rawip_do_unbind(connp); 2986 if (error) { 2987 if (error < 0) { 2988 icmp_err_ack(q, mp, -error, 0); 2989 } else { 2990 icmp_err_ack(q, mp, 0, error); 2991 } 2992 return; 2993 } 2994 2995 /* 2996 * Convert mp into a T_OK_ACK 2997 */ 2998 2999 mp = mi_tpi_ok_ack_alloc(mp); 3000 3001 /* 3002 * should not happen in practice... T_OK_ACK is smaller than the 3003 * original message. 3004 */ 3005 ASSERT(mp != NULL); 3006 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 3007 qreply(q, mp); 3008 } 3009 3010 /* 3011 * Process IPv4 packets that already include an IP header. 3012 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 3013 * IPPROTO_IGMP). 3014 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 3015 * 3016 * The packet is assumed to have a base (20 byte) IP header followed 3017 * by the upper-layer protocol. 
We include any IP_OPTIONS including a 3018 * CIPSO label but otherwise preserve the base IP header. 3019 */ 3020 static int 3021 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3022 { 3023 icmp_t *icmp = connp->conn_icmp; 3024 icmp_stack_t *is = icmp->icmp_is; 3025 ipha_t iphas; 3026 ipha_t *ipha; 3027 int ip_hdr_length; 3028 int tp_hdr_len; 3029 ip_xmit_attr_t *ixa; 3030 ip_pkt_t *ipp; 3031 in6_addr_t v6src; 3032 in6_addr_t v6dst; 3033 in6_addr_t v6nexthop; 3034 int error; 3035 boolean_t do_ipsec; 3036 3037 /* 3038 * We need an exclusive copy of conn_ixa since the included IP 3039 * header could have any destination. 3040 * That copy has no pointers hence we 3041 * need to set them up once we've parsed the ancillary data. 3042 */ 3043 ixa = conn_get_ixa_exclusive(connp); 3044 if (ixa == NULL) { 3045 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3046 freemsg(mp); 3047 return (ENOMEM); 3048 } 3049 ASSERT(cr != NULL); 3050 /* 3051 * Caller has a reference on cr; from db_credp or because we 3052 * are running in process context. 
3053 */ 3054 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3055 ixa->ixa_cred = cr; 3056 ixa->ixa_cpid = pid; 3057 if (is_system_labeled()) { 3058 /* We need to restart with a label based on the cred */ 3059 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3060 } 3061 3062 /* In case previous destination was multicast or multirt */ 3063 ip_attr_newdst(ixa); 3064 3065 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3066 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3067 if (ipp == NULL) { 3068 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3069 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3070 ixa->ixa_cpid = connp->conn_cpid; 3071 ixa_refrele(ixa); 3072 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3073 freemsg(mp); 3074 return (ENOMEM); 3075 } 3076 mutex_enter(&connp->conn_lock); 3077 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3078 mutex_exit(&connp->conn_lock); 3079 if (error != 0) { 3080 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3081 freemsg(mp); 3082 goto done; 3083 } 3084 3085 /* Sanity check length of packet */ 3086 ipha = (ipha_t *)mp->b_rptr; 3087 3088 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3089 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3090 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3091 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3092 freemsg(mp); 3093 goto done; 3094 } 3095 ipha = (ipha_t *)mp->b_rptr; 3096 } 3097 ipha->ipha_version_and_hdr_length = 3098 (IP_VERSION<<4) | (ip_hdr_length>>2); 3099 3100 /* 3101 * We set IXAF_DONTFRAG if the application set DF which makes 3102 * IP not fragment. 
3103 */ 3104 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3105 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3106 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3107 else 3108 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3109 3110 /* Even for multicast and broadcast we honor the apps ttl */ 3111 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3112 3113 /* 3114 * No source verification for non-local addresses 3115 */ 3116 if (ipha->ipha_src != INADDR_ANY && 3117 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3118 is->is_netstack->netstack_ip, B_FALSE) 3119 != IPVL_UNICAST_UP) { 3120 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3121 } 3122 3123 if (ipha->ipha_dst == INADDR_ANY) 3124 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3125 3126 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3127 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3128 3129 /* Defer IPsec if it might need to look at ICMP type/code */ 3130 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3131 ixa->ixa_flags |= IXAF_IS_IPV4; 3132 3133 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3134 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3135 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3136 (do_ipsec ? IPDF_IPSEC : 0)); 3137 switch (error) { 3138 case 0: 3139 break; 3140 case EADDRNOTAVAIL: 3141 /* 3142 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3143 * Don't have the application see that errno 3144 */ 3145 error = ENETUNREACH; 3146 goto failed; 3147 case ENETDOWN: 3148 /* 3149 * Have !ipif_addr_ready address; drop packet silently 3150 * until we can get applications to not send until we 3151 * are ready. 3152 */ 3153 error = 0; 3154 goto failed; 3155 case EHOSTUNREACH: 3156 case ENETUNREACH: 3157 if (ixa->ixa_ire != NULL) { 3158 /* 3159 * Let conn_ip_output/ire_send_noroute return 3160 * the error and send any local ICMP error. 
3161 */ 3162 error = 0; 3163 break; 3164 } 3165 /* FALLTHRU */ 3166 default: 3167 failed: 3168 freemsg(mp); 3169 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3170 goto done; 3171 } 3172 if (ipha->ipha_src == INADDR_ANY) 3173 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3174 3175 /* 3176 * We might be going to a different destination than last time, 3177 * thus check that TX allows the communication and compute any 3178 * needed label. 3179 * 3180 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3181 * don't have to worry about concurrent threads. 3182 */ 3183 if (is_system_labeled()) { 3184 /* 3185 * Check whether Trusted Solaris policy allows communication 3186 * with this host, and pretend that the destination is 3187 * unreachable if not. 3188 * Compute any needed label and place it in ipp_label_v4/v6. 3189 * 3190 * Later conn_build_hdr_template/conn_prepend_hdr takes 3191 * ipp_label_v4/v6 to form the packet. 3192 * 3193 * Tsol note: We have ipp structure local to this thread so 3194 * no locking is needed. 3195 */ 3196 error = conn_update_label(connp, ixa, &v6dst, ipp); 3197 if (error != 0) { 3198 freemsg(mp); 3199 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3200 goto done; 3201 } 3202 } 3203 3204 /* 3205 * Save away a copy of the IPv4 header the application passed down 3206 * and then prepend an IPv4 header complete with any IP options 3207 * including label. 3208 * We need a struct copy since icmp_prepend_hdr will reuse the available 3209 * space in the mblk. 
3210 */ 3211 iphas = *ipha; 3212 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3213 3214 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3215 if (mp == NULL) { 3216 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3217 ASSERT(error != 0); 3218 goto done; 3219 } 3220 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3221 error = EMSGSIZE; 3222 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3223 freemsg(mp); 3224 goto done; 3225 } 3226 /* Restore key parts of the header that the application passed down */ 3227 ipha = (ipha_t *)mp->b_rptr; 3228 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3229 ipha->ipha_ident = iphas.ipha_ident; 3230 ipha->ipha_fragment_offset_and_flags = 3231 iphas.ipha_fragment_offset_and_flags; 3232 ipha->ipha_ttl = iphas.ipha_ttl; 3233 ipha->ipha_protocol = iphas.ipha_protocol; 3234 ipha->ipha_src = iphas.ipha_src; 3235 ipha->ipha_dst = iphas.ipha_dst; 3236 3237 ixa->ixa_protocol = ipha->ipha_protocol; 3238 3239 /* 3240 * Make sure that the IP header plus any transport header that is 3241 * checksumed by ip_output is in the first mblk. (ip_output assumes 3242 * that at least the checksum field is in the first mblk.) 
3243 */ 3244 switch (ipha->ipha_protocol) { 3245 case IPPROTO_UDP: 3246 tp_hdr_len = 8; 3247 break; 3248 case IPPROTO_TCP: 3249 tp_hdr_len = 20; 3250 break; 3251 default: 3252 tp_hdr_len = 0; 3253 break; 3254 } 3255 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3256 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3257 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3258 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3259 if (mp->b_cont == NULL) 3260 error = EINVAL; 3261 else 3262 error = ENOMEM; 3263 freemsg(mp); 3264 goto done; 3265 } 3266 } 3267 3268 if (!do_ipsec) { 3269 /* Policy might differ for different ICMP type/code */ 3270 if (ixa->ixa_ipsec_policy != NULL) { 3271 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3272 ixa->ixa_ipsec_policy = NULL; 3273 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3274 } 3275 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3276 if (mp == NULL) { 3277 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3278 error = EHOSTUNREACH; /* IPsec policy failure */ 3279 goto done; 3280 } 3281 } 3282 3283 /* We're done. Pass the packet to ip. */ 3284 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3285 3286 error = conn_ip_output(mp, ixa); 3287 /* No rawipOutErrors if an error since IP increases its error counter */ 3288 switch (error) { 3289 case 0: 3290 break; 3291 case EWOULDBLOCK: 3292 (void) ixa_check_drain_insert(connp, ixa); 3293 error = 0; 3294 break; 3295 case EADDRNOTAVAIL: 3296 /* 3297 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3298 * Don't have the application see that errno 3299 */ 3300 error = ENETUNREACH; 3301 break; 3302 } 3303 done: 3304 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3305 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3306 ixa->ixa_cpid = connp->conn_cpid; 3307 ixa_refrele(ixa); 3308 ip_pkt_free(ipp); 3309 kmem_free(ipp, sizeof (*ipp)); 3310 return (error); 3311 } 3312 3313 static mblk_t * 3314 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3315 { 3316 ipha_t *ipha = NULL; 3317 ip6_t *ip6h = NULL; 3318 3319 if (ixa->ixa_flags & IXAF_IS_IPV4) 3320 ipha = (ipha_t *)mp->b_rptr; 3321 else 3322 ip6h = (ip6_t *)mp->b_rptr; 3323 3324 if (ixa->ixa_ipsec_policy != NULL) { 3325 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3326 ixa->ixa_ipsec_policy = NULL; 3327 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3328 } 3329 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3330 } 3331 3332 /* 3333 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3334 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3335 * the TPI options, otherwise we take them from msg_control. 3336 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3337 * Always consumes mp; never consumes tudr_mp. 3338 */ 3339 static int 3340 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3341 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3342 { 3343 icmp_t *icmp = connp->conn_icmp; 3344 icmp_stack_t *is = icmp->icmp_is; 3345 int error; 3346 ip_xmit_attr_t *ixa; 3347 ip_pkt_t *ipp; 3348 in6_addr_t v6src; 3349 in6_addr_t v6dst; 3350 in6_addr_t v6nexthop; 3351 in_port_t dstport; 3352 uint32_t flowinfo; 3353 int is_absreq_failure = 0; 3354 conn_opt_arg_t coas, *coa; 3355 3356 ASSERT(tudr_mp != NULL || msg != NULL); 3357 3358 /* 3359 * Get ixa before checking state to handle a disconnect race. 3360 * 3361 * We need an exclusive copy of conn_ixa since the ancillary data 3362 * options might modify it. 
That copy has no pointers hence we 3363 * need to set them up once we've parsed the ancillary data. 3364 */ 3365 ixa = conn_get_ixa_exclusive(connp); 3366 if (ixa == NULL) { 3367 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3368 freemsg(mp); 3369 return (ENOMEM); 3370 } 3371 ASSERT(cr != NULL); 3372 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3373 ixa->ixa_cred = cr; 3374 ixa->ixa_cpid = pid; 3375 if (is_system_labeled()) { 3376 /* We need to restart with a label based on the cred */ 3377 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3378 } 3379 3380 /* In case previous destination was multicast or multirt */ 3381 ip_attr_newdst(ixa); 3382 3383 /* Get a copy of conn_xmit_ipp since the options might change it */ 3384 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3385 if (ipp == NULL) { 3386 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3387 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3388 ixa->ixa_cpid = connp->conn_cpid; 3389 ixa_refrele(ixa); 3390 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3391 freemsg(mp); 3392 return (ENOMEM); 3393 } 3394 mutex_enter(&connp->conn_lock); 3395 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3396 mutex_exit(&connp->conn_lock); 3397 if (error != 0) { 3398 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3399 freemsg(mp); 3400 goto done; 3401 } 3402 3403 /* 3404 * Parse the options and update ixa and ipp as a result. 
3405 */ 3406 3407 coa = &coas; 3408 coa->coa_connp = connp; 3409 coa->coa_ixa = ixa; 3410 coa->coa_ipp = ipp; 3411 coa->coa_ancillary = B_TRUE; 3412 coa->coa_changed = 0; 3413 3414 if (msg != NULL) { 3415 error = process_auxiliary_options(connp, msg->msg_control, 3416 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3417 } else { 3418 struct T_unitdata_req *tudr; 3419 3420 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3421 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3422 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3423 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3424 coa, &is_absreq_failure); 3425 } 3426 if (error != 0) { 3427 /* 3428 * Note: No special action needed in this 3429 * module for "is_absreq_failure" 3430 */ 3431 freemsg(mp); 3432 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3433 goto done; 3434 } 3435 ASSERT(is_absreq_failure == 0); 3436 3437 mutex_enter(&connp->conn_lock); 3438 /* 3439 * If laddr is unspecified then we look at sin6_src_id. 3440 * We will give precedence to a source address set with IPV6_PKTINFO 3441 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3442 * want ip_attr_connect to select a source (since it can fail) when 3443 * IPV6_PKTINFO is specified. 3444 * If this doesn't result in a source address then we get a source 3445 * from ip_attr_connect() below. 
3446 */ 3447 v6src = connp->conn_saddr_v6; 3448 if (sin != NULL) { 3449 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3450 dstport = sin->sin_port; 3451 flowinfo = 0; 3452 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3453 ixa->ixa_flags |= IXAF_IS_IPV4; 3454 } else if (sin6 != NULL) { 3455 boolean_t v4mapped; 3456 uint_t srcid; 3457 3458 v6dst = sin6->sin6_addr; 3459 dstport = sin6->sin6_port; 3460 flowinfo = sin6->sin6_flowinfo; 3461 srcid = sin6->__sin6_src_id; 3462 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3463 ixa->ixa_scopeid = sin6->sin6_scope_id; 3464 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3465 } else { 3466 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3467 } 3468 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 3469 if (v4mapped) 3470 ixa->ixa_flags |= IXAF_IS_IPV4; 3471 else 3472 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3473 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3474 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3475 v4mapped, connp->conn_netstack)) { 3476 /* Mismatched v4mapped/v6 specified by srcid. */ 3477 mutex_exit(&connp->conn_lock); 3478 error = EADDRNOTAVAIL; 3479 goto failed; /* Does freemsg() and mib. */ 3480 } 3481 } 3482 } else { 3483 /* Connected case */ 3484 dstport = connp->conn_fport; 3485 v6dst = connp->conn_faddr_v6; 3486 flowinfo = connp->conn_flowinfo; 3487 } 3488 mutex_exit(&connp->conn_lock); 3489 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 3490 if (ipp->ipp_fields & IPPF_ADDR) { 3491 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3492 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3493 v6src = ipp->ipp_addr; 3494 } else { 3495 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3496 v6src = ipp->ipp_addr; 3497 } 3498 } 3499 /* 3500 * Allow source not assigned to the system 3501 * only if it is not a local addresses 3502 */ 3503 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3504 ip_laddr_t laddr_type; 3505 3506 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3507 ipaddr_t v4src; 3508 3509 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3510 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3511 is->is_netstack->netstack_ip, B_FALSE); 3512 } else { 3513 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3514 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3515 } 3516 if (laddr_type != IPVL_UNICAST_UP) 3517 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3518 } 3519 3520 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3521 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3522 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3523 3524 switch (error) { 3525 case 0: 3526 break; 3527 case EADDRNOTAVAIL: 3528 /* 3529 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3530 * Don't have the application see that errno 3531 */ 3532 error = ENETUNREACH; 3533 goto failed; 3534 case ENETDOWN: 3535 /* 3536 * Have !ipif_addr_ready address; drop packet silently 3537 * until we can get applications to not send until we 3538 * are ready. 3539 */ 3540 error = 0; 3541 goto failed; 3542 case EHOSTUNREACH: 3543 case ENETUNREACH: 3544 if (ixa->ixa_ire != NULL) { 3545 /* 3546 * Let conn_ip_output/ire_send_noroute return 3547 * the error and send any local ICMP error. 
3548 */ 3549 error = 0; 3550 break; 3551 } 3552 /* FALLTHRU */ 3553 default: 3554 failed: 3555 freemsg(mp); 3556 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3557 goto done; 3558 } 3559 3560 /* 3561 * We might be going to a different destination than last time, 3562 * thus check that TX allows the communication and compute any 3563 * needed label. 3564 * 3565 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3566 * don't have to worry about concurrent threads. 3567 */ 3568 if (is_system_labeled()) { 3569 /* 3570 * Check whether Trusted Solaris policy allows communication 3571 * with this host, and pretend that the destination is 3572 * unreachable if not. 3573 * Compute any needed label and place it in ipp_label_v4/v6. 3574 * 3575 * Later conn_build_hdr_template/conn_prepend_hdr takes 3576 * ipp_label_v4/v6 to form the packet. 3577 * 3578 * Tsol note: We have ipp structure local to this thread so 3579 * no locking is needed. 3580 */ 3581 error = conn_update_label(connp, ixa, &v6dst, ipp); 3582 if (error != 0) { 3583 freemsg(mp); 3584 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3585 goto done; 3586 } 3587 } 3588 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3589 &error); 3590 if (mp == NULL) { 3591 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3592 ASSERT(error != 0); 3593 goto done; 3594 } 3595 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3596 error = EMSGSIZE; 3597 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3598 freemsg(mp); 3599 goto done; 3600 } 3601 3602 /* Policy might differ for different ICMP type/code */ 3603 mp = icmp_output_attach_policy(mp, connp, ixa); 3604 if (mp == NULL) { 3605 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3606 error = EHOSTUNREACH; /* IPsec policy failure */ 3607 goto done; 3608 } 3609 3610 /* We're done. Pass the packet to ip. 
*/ 3611 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3612 3613 error = conn_ip_output(mp, ixa); 3614 if (!connp->conn_unspec_src) 3615 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3616 /* No rawipOutErrors if an error since IP increases its error counter */ 3617 switch (error) { 3618 case 0: 3619 break; 3620 case EWOULDBLOCK: 3621 (void) ixa_check_drain_insert(connp, ixa); 3622 error = 0; 3623 break; 3624 case EADDRNOTAVAIL: 3625 /* 3626 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3627 * Don't have the application see that errno 3628 */ 3629 error = ENETUNREACH; 3630 /* FALLTHRU */ 3631 default: 3632 mutex_enter(&connp->conn_lock); 3633 /* 3634 * Clear the source and v6lastdst so we call ip_attr_connect 3635 * for the next packet and try to pick a better source. 3636 */ 3637 if (connp->conn_mcbc_bind) 3638 connp->conn_saddr_v6 = ipv6_all_zeros; 3639 else 3640 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3641 connp->conn_v6lastdst = ipv6_all_zeros; 3642 mutex_exit(&connp->conn_lock); 3643 break; 3644 } 3645 done: 3646 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3647 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3648 ixa->ixa_cpid = connp->conn_cpid; 3649 ixa_refrele(ixa); 3650 ip_pkt_free(ipp); 3651 kmem_free(ipp, sizeof (*ipp)); 3652 return (error); 3653 } 3654 3655 /* 3656 * Handle sending an M_DATA for a connected socket. 3657 * Handles both IPv4 and IPv6. 3658 */ 3659 int 3660 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3661 { 3662 icmp_t *icmp = connp->conn_icmp; 3663 icmp_stack_t *is = icmp->icmp_is; 3664 int error; 3665 ip_xmit_attr_t *ixa; 3666 boolean_t do_ipsec; 3667 3668 /* 3669 * If no other thread is using conn_ixa this just gets a reference to 3670 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 
3671 */ 3672 ixa = conn_get_ixa(connp, B_FALSE); 3673 if (ixa == NULL) { 3674 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3675 freemsg(mp); 3676 return (ENOMEM); 3677 } 3678 3679 ASSERT(cr != NULL); 3680 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3681 ixa->ixa_cred = cr; 3682 ixa->ixa_cpid = pid; 3683 3684 /* Defer IPsec if it might need to look at ICMP type/code */ 3685 switch (ixa->ixa_protocol) { 3686 case IPPROTO_ICMP: 3687 case IPPROTO_ICMPV6: 3688 do_ipsec = B_FALSE; 3689 break; 3690 default: 3691 do_ipsec = B_TRUE; 3692 } 3693 3694 mutex_enter(&connp->conn_lock); 3695 mp = icmp_prepend_header_template(connp, ixa, mp, 3696 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3697 3698 if (mp == NULL) { 3699 ASSERT(error != 0); 3700 mutex_exit(&connp->conn_lock); 3701 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3702 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3703 ixa->ixa_cpid = connp->conn_cpid; 3704 ixa_refrele(ixa); 3705 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3706 freemsg(mp); 3707 return (error); 3708 } 3709 3710 if (!do_ipsec) { 3711 /* Policy might differ for different ICMP type/code */ 3712 mp = icmp_output_attach_policy(mp, connp, ixa); 3713 if (mp == NULL) { 3714 mutex_exit(&connp->conn_lock); 3715 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3716 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3717 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3718 ixa->ixa_cpid = connp->conn_cpid; 3719 ixa_refrele(ixa); 3720 return (EHOSTUNREACH); /* IPsec policy failure */ 3721 } 3722 } 3723 3724 /* 3725 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3726 * safe copy, then we need to fill in any pointers in it. 
3727 */ 3728 if (ixa->ixa_ire == NULL) { 3729 in6_addr_t faddr, saddr; 3730 in6_addr_t nexthop; 3731 in_port_t fport; 3732 3733 saddr = connp->conn_saddr_v6; 3734 faddr = connp->conn_faddr_v6; 3735 fport = connp->conn_fport; 3736 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3737 mutex_exit(&connp->conn_lock); 3738 3739 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3740 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3741 (do_ipsec ? IPDF_IPSEC : 0)); 3742 switch (error) { 3743 case 0: 3744 break; 3745 case EADDRNOTAVAIL: 3746 /* 3747 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3748 * Don't have the application see that errno 3749 */ 3750 error = ENETUNREACH; 3751 goto failed; 3752 case ENETDOWN: 3753 /* 3754 * Have !ipif_addr_ready address; drop packet silently 3755 * until we can get applications to not send until we 3756 * are ready. 3757 */ 3758 error = 0; 3759 goto failed; 3760 case EHOSTUNREACH: 3761 case ENETUNREACH: 3762 if (ixa->ixa_ire != NULL) { 3763 /* 3764 * Let conn_ip_output/ire_send_noroute return 3765 * the error and send any local ICMP error. 3766 */ 3767 error = 0; 3768 break; 3769 } 3770 /* FALLTHRU */ 3771 default: 3772 failed: 3773 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3774 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3775 ixa->ixa_cpid = connp->conn_cpid; 3776 ixa_refrele(ixa); 3777 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3778 freemsg(mp); 3779 return (error); 3780 } 3781 } else { 3782 /* Done with conn_t */ 3783 mutex_exit(&connp->conn_lock); 3784 } 3785 3786 /* We're done. Pass the packet to ip. 
*/ 3787 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3788 3789 error = conn_ip_output(mp, ixa); 3790 /* No rawipOutErrors if an error since IP increases its error counter */ 3791 switch (error) { 3792 case 0: 3793 break; 3794 case EWOULDBLOCK: 3795 (void) ixa_check_drain_insert(connp, ixa); 3796 error = 0; 3797 break; 3798 case EADDRNOTAVAIL: 3799 /* 3800 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3801 * Don't have the application see that errno 3802 */ 3803 error = ENETUNREACH; 3804 break; 3805 } 3806 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3807 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3808 ixa->ixa_cpid = connp->conn_cpid; 3809 ixa_refrele(ixa); 3810 return (error); 3811 } 3812 3813 /* 3814 * Handle sending an M_DATA to the last destination. 3815 * Handles both IPv4 and IPv6. 3816 * 3817 * NOTE: The caller must hold conn_lock and we drop it here. 3818 */ 3819 int 3820 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3821 ip_xmit_attr_t *ixa) 3822 { 3823 icmp_t *icmp = connp->conn_icmp; 3824 icmp_stack_t *is = icmp->icmp_is; 3825 int error; 3826 boolean_t do_ipsec; 3827 3828 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3829 ASSERT(ixa != NULL); 3830 3831 ASSERT(cr != NULL); 3832 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3833 ixa->ixa_cred = cr; 3834 ixa->ixa_cpid = pid; 3835 3836 /* Defer IPsec if it might need to look at ICMP type/code */ 3837 switch (ixa->ixa_protocol) { 3838 case IPPROTO_ICMP: 3839 case IPPROTO_ICMPV6: 3840 do_ipsec = B_FALSE; 3841 break; 3842 default: 3843 do_ipsec = B_TRUE; 3844 } 3845 3846 3847 mp = icmp_prepend_header_template(connp, ixa, mp, 3848 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3849 3850 if (mp == NULL) { 3851 ASSERT(error != 0); 3852 mutex_exit(&connp->conn_lock); 3853 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3854 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3855 ixa->ixa_cpid = connp->conn_cpid; 3856 ixa_refrele(ixa); 3857 BUMP_MIB(&is->is_rawip_mib, 
rawipOutErrors); 3858 freemsg(mp); 3859 return (error); 3860 } 3861 3862 if (!do_ipsec) { 3863 /* Policy might differ for different ICMP type/code */ 3864 mp = icmp_output_attach_policy(mp, connp, ixa); 3865 if (mp == NULL) { 3866 mutex_exit(&connp->conn_lock); 3867 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3868 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3869 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3870 ixa->ixa_cpid = connp->conn_cpid; 3871 ixa_refrele(ixa); 3872 return (EHOSTUNREACH); /* IPsec policy failure */ 3873 } 3874 } 3875 3876 /* 3877 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3878 * safe copy, then we need to fill in any pointers in it. 3879 */ 3880 if (ixa->ixa_ire == NULL) { 3881 in6_addr_t lastdst, lastsrc; 3882 in6_addr_t nexthop; 3883 in_port_t lastport; 3884 3885 lastsrc = connp->conn_v6lastsrc; 3886 lastdst = connp->conn_v6lastdst; 3887 lastport = connp->conn_lastdstport; 3888 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3889 mutex_exit(&connp->conn_lock); 3890 3891 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3892 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3893 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3894 switch (error) { 3895 case 0: 3896 break; 3897 case EADDRNOTAVAIL: 3898 /* 3899 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3900 * Don't have the application see that errno 3901 */ 3902 error = ENETUNREACH; 3903 goto failed; 3904 case ENETDOWN: 3905 /* 3906 * Have !ipif_addr_ready address; drop packet silently 3907 * until we can get applications to not send until we 3908 * are ready. 3909 */ 3910 error = 0; 3911 goto failed; 3912 case EHOSTUNREACH: 3913 case ENETUNREACH: 3914 if (ixa->ixa_ire != NULL) { 3915 /* 3916 * Let conn_ip_output/ire_send_noroute return 3917 * the error and send any local ICMP error. 
3918 */ 3919 error = 0; 3920 break; 3921 } 3922 /* FALLTHRU */ 3923 default: 3924 failed: 3925 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3926 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3927 ixa->ixa_cpid = connp->conn_cpid; 3928 ixa_refrele(ixa); 3929 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3930 freemsg(mp); 3931 return (error); 3932 } 3933 } else { 3934 /* Done with conn_t */ 3935 mutex_exit(&connp->conn_lock); 3936 } 3937 3938 /* We're done. Pass the packet to ip. */ 3939 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3940 error = conn_ip_output(mp, ixa); 3941 /* No rawipOutErrors if an error since IP increases its error counter */ 3942 switch (error) { 3943 case 0: 3944 break; 3945 case EWOULDBLOCK: 3946 (void) ixa_check_drain_insert(connp, ixa); 3947 error = 0; 3948 break; 3949 case EADDRNOTAVAIL: 3950 /* 3951 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3952 * Don't have the application see that errno 3953 */ 3954 error = ENETUNREACH; 3955 /* FALLTHRU */ 3956 default: 3957 mutex_enter(&connp->conn_lock); 3958 /* 3959 * Clear the source and v6lastdst so we call ip_attr_connect 3960 * for the next packet and try to pick a better source. 3961 */ 3962 if (connp->conn_mcbc_bind) 3963 connp->conn_saddr_v6 = ipv6_all_zeros; 3964 else 3965 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3966 connp->conn_v6lastdst = ipv6_all_zeros; 3967 mutex_exit(&connp->conn_lock); 3968 break; 3969 } 3970 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3971 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3972 ixa->ixa_cpid = connp->conn_cpid; 3973 ixa_refrele(ixa); 3974 return (error); 3975 } 3976 3977 3978 /* 3979 * Prepend the header template and then fill in the source and 3980 * flowinfo. The caller needs to handle the destination address since 3981 * it's setting is different if rthdr or source route. 3982 * 3983 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3984 * When it returns NULL it sets errorp. 
3985 */ 3986 static mblk_t * 3987 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, 3988 const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) 3989 { 3990 icmp_t *icmp = connp->conn_icmp; 3991 icmp_stack_t *is = icmp->icmp_is; 3992 uint_t pktlen; 3993 uint_t copylen; 3994 uint8_t *iph; 3995 uint_t ip_hdr_length; 3996 uint32_t cksum; 3997 ip_pkt_t *ipp; 3998 3999 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4000 4001 /* 4002 * Copy the header template. 4003 */ 4004 copylen = connp->conn_ht_iphc_len; 4005 pktlen = copylen + msgdsize(mp); 4006 if (pktlen > IP_MAXPACKET) { 4007 freemsg(mp); 4008 *errorp = EMSGSIZE; 4009 return (NULL); 4010 } 4011 ixa->ixa_pktlen = pktlen; 4012 4013 /* check/fix buffer config, setup pointers into it */ 4014 iph = mp->b_rptr - copylen; 4015 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { 4016 mblk_t *mp1; 4017 4018 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); 4019 if (mp1 == NULL) { 4020 freemsg(mp); 4021 *errorp = ENOMEM; 4022 return (NULL); 4023 } 4024 mp1->b_wptr = DB_LIM(mp1); 4025 mp1->b_cont = mp; 4026 mp = mp1; 4027 iph = (mp->b_wptr - copylen); 4028 } 4029 mp->b_rptr = iph; 4030 bcopy(connp->conn_ht_iphc, iph, copylen); 4031 ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); 4032 4033 ixa->ixa_ip_hdr_length = ip_hdr_length; 4034 4035 /* 4036 * Prepare for ICMPv6 checksum done in IP. 4037 * 4038 * icmp_build_hdr_template has already massaged any routing header 4039 * and placed the result in conn_sum. 4040 * 4041 * We make it easy for IP to include our pseudo header 4042 * by putting our length (and any routing header adjustment) 4043 * in the ICMPv6 checksum field. 
4044 */ 4045 cksum = pktlen - ip_hdr_length; 4046 4047 cksum += connp->conn_sum; 4048 cksum = (cksum >> 16) + (cksum & 0xFFFF); 4049 ASSERT(cksum < 0x10000); 4050 4051 ipp = &connp->conn_xmit_ipp; 4052 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4053 ipha_t *ipha = (ipha_t *)iph; 4054 4055 ipha->ipha_length = htons((uint16_t)pktlen); 4056 4057 /* if IP_PKTINFO specified an addres it wins over bind() */ 4058 if ((ipp->ipp_fields & IPPF_ADDR) && 4059 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4060 ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); 4061 ipha->ipha_src = ipp->ipp_addr_v4; 4062 } else { 4063 IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); 4064 } 4065 } else { 4066 ip6_t *ip6h = (ip6_t *)iph; 4067 uint_t cksum_offset = 0; 4068 4069 ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); 4070 4071 /* if IP_PKTINFO specified an addres it wins over bind() */ 4072 if ((ipp->ipp_fields & IPPF_ADDR) && 4073 !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4074 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); 4075 ip6h->ip6_src = ipp->ipp_addr; 4076 } else { 4077 ip6h->ip6_src = *v6src; 4078 } 4079 ip6h->ip6_vcf = 4080 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 4081 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 4082 if (ipp->ipp_fields & IPPF_TCLASS) { 4083 /* Overrides the class part of flowinfo */ 4084 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, 4085 ipp->ipp_tclass); 4086 } 4087 4088 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 4089 if (connp->conn_proto == IPPROTO_ICMPV6) { 4090 cksum_offset = ixa->ixa_ip_hdr_length + 4091 offsetof(icmp6_t, icmp6_cksum); 4092 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 4093 cksum_offset = ixa->ixa_ip_hdr_length + 4094 ixa->ixa_raw_cksum_offset; 4095 } 4096 } 4097 if (cksum_offset != 0) { 4098 uint16_t *ptr; 4099 4100 /* Make sure the checksum fits in the first mblk */ 4101 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 4102 mblk_t *mp1; 4103 4104 mp1 = msgpullup(mp, 4105 cksum_offset + sizeof (short)); 4106 freemsg(mp); 4107 if (mp1 
== NULL) { 4108 *errorp = ENOMEM; 4109 return (NULL); 4110 } 4111 mp = mp1; 4112 iph = mp->b_rptr; 4113 ip6h = (ip6_t *)iph; 4114 } 4115 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 4116 *ptr = htons(cksum); 4117 } 4118 } 4119 4120 return (mp); 4121 } 4122 4123 /* 4124 * This routine handles all messages passed downstream. It either 4125 * consumes the message or passes it downstream; it never queues a 4126 * a message. 4127 */ 4128 int 4129 icmp_wput(queue_t *q, mblk_t *mp) 4130 { 4131 sin6_t *sin6; 4132 sin_t *sin = NULL; 4133 uint_t srcid; 4134 conn_t *connp = Q_TO_CONN(q); 4135 icmp_t *icmp = connp->conn_icmp; 4136 int error = 0; 4137 struct sockaddr *addr = NULL; 4138 socklen_t addrlen; 4139 icmp_stack_t *is = icmp->icmp_is; 4140 struct T_unitdata_req *tudr; 4141 mblk_t *data_mp; 4142 cred_t *cr; 4143 pid_t pid; 4144 4145 /* 4146 * We directly handle several cases here: T_UNITDATA_REQ message 4147 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected 4148 * socket. 4149 */ 4150 switch (DB_TYPE(mp)) { 4151 case M_DATA: 4152 /* sockfs never sends down M_DATA */ 4153 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4154 freemsg(mp); 4155 return (0); 4156 4157 case M_PROTO: 4158 case M_PCPROTO: 4159 tudr = (struct T_unitdata_req *)mp->b_rptr; 4160 if (MBLKL(mp) < sizeof (*tudr) || 4161 ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { 4162 icmp_wput_other(q, mp); 4163 return (0); 4164 } 4165 break; 4166 4167 default: 4168 icmp_wput_other(q, mp); 4169 return (0); 4170 } 4171 4172 /* Handle valid T_UNITDATA_REQ here */ 4173 data_mp = mp->b_cont; 4174 if (data_mp == NULL) { 4175 error = EPROTO; 4176 goto ud_error2; 4177 } 4178 mp->b_cont = NULL; 4179 4180 if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { 4181 error = EADDRNOTAVAIL; 4182 goto ud_error2; 4183 } 4184 4185 /* 4186 * All Solaris components should pass a db_credp 4187 * for this message, hence we ASSERT. 
4188 * On production kernels we return an error to be robust against 4189 * random streams modules sitting on top of us. 4190 */ 4191 cr = msg_getcred(mp, &pid); 4192 ASSERT(cr != NULL); 4193 if (cr == NULL) { 4194 error = EINVAL; 4195 goto ud_error2; 4196 } 4197 4198 /* 4199 * If a port has not been bound to the stream, fail. 4200 * This is not a problem when sockfs is directly 4201 * above us, because it will ensure that the socket 4202 * is first bound before allowing data to be sent. 4203 */ 4204 if (icmp->icmp_state == TS_UNBND) { 4205 error = EPROTO; 4206 goto ud_error2; 4207 } 4208 addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; 4209 addrlen = tudr->DEST_length; 4210 4211 switch (connp->conn_family) { 4212 case AF_INET6: 4213 sin6 = (sin6_t *)addr; 4214 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || 4215 (sin6->sin6_family != AF_INET6)) { 4216 error = EADDRNOTAVAIL; 4217 goto ud_error2; 4218 } 4219 4220 /* No support for mapped addresses on raw sockets */ 4221 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4222 error = EADDRNOTAVAIL; 4223 goto ud_error2; 4224 } 4225 srcid = sin6->__sin6_src_id; 4226 4227 /* 4228 * If the local address is a mapped address return 4229 * an error. 4230 * It would be possible to send an IPv6 packet but the 4231 * response would never make it back to the application 4232 * since it is bound to a mapped address. 4233 */ 4234 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 4235 error = EADDRNOTAVAIL; 4236 goto ud_error2; 4237 } 4238 4239 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 4240 sin6->sin6_addr = ipv6_loopback; 4241 4242 if (tudr->OPT_length != 0) { 4243 /* 4244 * If we are connected then the destination needs to be 4245 * the same as the connected one. 
4246 */ 4247 if (icmp->icmp_state == TS_DATA_XFER && 4248 !conn_same_as_last_v6(connp, sin6)) { 4249 error = EISCONN; 4250 goto ud_error2; 4251 } 4252 error = icmp_output_ancillary(connp, NULL, sin6, 4253 data_mp, mp, NULL, cr, pid); 4254 } else { 4255 ip_xmit_attr_t *ixa; 4256 4257 /* 4258 * We have to allocate an ip_xmit_attr_t before we grab 4259 * conn_lock and we need to hold conn_lock once we've 4260 * checked conn_same_as_last_v6 to handle concurrent 4261 * send* calls on a socket. 4262 */ 4263 ixa = conn_get_ixa(connp, B_FALSE); 4264 if (ixa == NULL) { 4265 error = ENOMEM; 4266 goto ud_error2; 4267 } 4268 mutex_enter(&connp->conn_lock); 4269 4270 if (conn_same_as_last_v6(connp, sin6) && 4271 connp->conn_lastsrcid == srcid && 4272 ipsec_outbound_policy_current(ixa)) { 4273 /* icmp_output_lastdst drops conn_lock */ 4274 error = icmp_output_lastdst(connp, data_mp, cr, 4275 pid, ixa); 4276 } else { 4277 /* icmp_output_newdst drops conn_lock */ 4278 error = icmp_output_newdst(connp, data_mp, NULL, 4279 sin6, cr, pid, ixa); 4280 } 4281 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4282 } 4283 if (error == 0) { 4284 freeb(mp); 4285 return (0); 4286 } 4287 break; 4288 4289 case AF_INET: 4290 sin = (sin_t *)addr; 4291 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || 4292 (sin->sin_family != AF_INET)) { 4293 error = EADDRNOTAVAIL; 4294 goto ud_error2; 4295 } 4296 if (sin->sin_addr.s_addr == INADDR_ANY) 4297 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 4298 4299 /* Protocol 255 contains full IP headers */ 4300 /* Read without holding lock */ 4301 if (icmp->icmp_hdrincl) { 4302 if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { 4303 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { 4304 error = EINVAL; 4305 goto ud_error2; 4306 } 4307 } 4308 error = icmp_output_hdrincl(connp, data_mp, cr, pid); 4309 if (error == 0) { 4310 freeb(mp); 4311 return (0); 4312 } 4313 /* data_mp consumed above */ 4314 data_mp = NULL; 4315 goto ud_error2; 4316 } 4317 4318 if 
(tudr->OPT_length != 0) { 4319 /* 4320 * If we are connected then the destination needs to be 4321 * the same as the connected one. 4322 */ 4323 if (icmp->icmp_state == TS_DATA_XFER && 4324 !conn_same_as_last_v4(connp, sin)) { 4325 error = EISCONN; 4326 goto ud_error2; 4327 } 4328 error = icmp_output_ancillary(connp, sin, NULL, 4329 data_mp, mp, NULL, cr, pid); 4330 } else { 4331 ip_xmit_attr_t *ixa; 4332 4333 /* 4334 * We have to allocate an ip_xmit_attr_t before we grab 4335 * conn_lock and we need to hold conn_lock once we've 4336 * checked conn_same_as_last_v4 to handle concurrent 4337 * send* calls on a socket. 4338 */ 4339 ixa = conn_get_ixa(connp, B_FALSE); 4340 if (ixa == NULL) { 4341 error = ENOMEM; 4342 goto ud_error2; 4343 } 4344 mutex_enter(&connp->conn_lock); 4345 4346 if (conn_same_as_last_v4(connp, sin) && 4347 ipsec_outbound_policy_current(ixa)) { 4348 /* icmp_output_lastdst drops conn_lock */ 4349 error = icmp_output_lastdst(connp, data_mp, cr, 4350 pid, ixa); 4351 } else { 4352 /* icmp_output_newdst drops conn_lock */ 4353 error = icmp_output_newdst(connp, data_mp, sin, 4354 NULL, cr, pid, ixa); 4355 } 4356 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4357 } 4358 if (error == 0) { 4359 freeb(mp); 4360 return (0); 4361 } 4362 break; 4363 } 4364 ASSERT(mp != NULL); 4365 /* mp is freed by the following routine */ 4366 icmp_ud_err(q, mp, (t_scalar_t)error); 4367 return (0); 4368 4369 ud_error2: 4370 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4371 freemsg(data_mp); 4372 ASSERT(mp != NULL); 4373 /* mp is freed by the following routine */ 4374 icmp_ud_err(q, mp, (t_scalar_t)error); 4375 return (0); 4376 } 4377 4378 /* 4379 * Handle the case of the IP address or flow label being different 4380 * for both IPv4 and IPv6. 4381 * 4382 * NOTE: The caller must hold conn_lock and we drop it here. 
4383 */ 4384 static int 4385 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4386 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4387 { 4388 icmp_t *icmp = connp->conn_icmp; 4389 icmp_stack_t *is = icmp->icmp_is; 4390 int error; 4391 ip_xmit_attr_t *oldixa; 4392 boolean_t do_ipsec; 4393 uint_t srcid; 4394 uint32_t flowinfo; 4395 in6_addr_t v6src; 4396 in6_addr_t v6dst; 4397 in6_addr_t v6nexthop; 4398 in_port_t dstport; 4399 4400 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4401 ASSERT(ixa != NULL); 4402 4403 /* 4404 * We hold conn_lock across all the use and modifications of 4405 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4406 * stay consistent. 4407 */ 4408 4409 ASSERT(cr != NULL); 4410 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4411 ixa->ixa_cred = cr; 4412 ixa->ixa_cpid = pid; 4413 if (is_system_labeled()) { 4414 /* We need to restart with a label based on the cred */ 4415 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4416 } 4417 /* 4418 * If we are connected then the destination needs to be the 4419 * same as the connected one, which is not the case here since we 4420 * checked for that above. 4421 */ 4422 if (icmp->icmp_state == TS_DATA_XFER) { 4423 mutex_exit(&connp->conn_lock); 4424 error = EISCONN; 4425 goto ud_error; 4426 } 4427 4428 /* 4429 * Before we modify the ixa at all, invalidate our most recent address 4430 * to assure that any subsequent call to conn_same_as_last_v6() will 4431 * not indicate a match: any thread that picks up conn_lock after we 4432 * drop it (but before we pick it up again and properly set the most 4433 * recent address) must not associate the ixa with the (now old) last 4434 * address. 4435 */ 4436 connp->conn_v6lastdst = ipv6_all_zeros; 4437 4438 /* In case previous destination was multicast or multirt */ 4439 ip_attr_newdst(ixa); 4440 4441 /* 4442 * If laddr is unspecified then we look at sin6_src_id. 
4443 * We will give precedence to a source address set with IPV6_PKTINFO 4444 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4445 * want ip_attr_connect to select a source (since it can fail) when 4446 * IPV6_PKTINFO is specified. 4447 * If this doesn't result in a source address then we get a source 4448 * from ip_attr_connect() below. 4449 */ 4450 v6src = connp->conn_saddr_v6; 4451 if (sin != NULL) { 4452 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4453 dstport = sin->sin_port; 4454 flowinfo = 0; 4455 /* Don't bother with ip_srcid_find_id(), but indicate anyway. */ 4456 srcid = 0; 4457 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4458 ixa->ixa_flags |= IXAF_IS_IPV4; 4459 } else { 4460 boolean_t v4mapped; 4461 4462 v6dst = sin6->sin6_addr; 4463 dstport = sin6->sin6_port; 4464 flowinfo = sin6->sin6_flowinfo; 4465 srcid = sin6->__sin6_src_id; 4466 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4467 ixa->ixa_scopeid = sin6->sin6_scope_id; 4468 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4469 } else { 4470 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4471 } 4472 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 4473 if (v4mapped) 4474 ixa->ixa_flags |= IXAF_IS_IPV4; 4475 else 4476 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4477 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4478 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4479 v4mapped, connp->conn_netstack)) { 4480 /* Mismatched v4mapped/v6 specified by srcid. */ 4481 mutex_exit(&connp->conn_lock); 4482 error = EADDRNOTAVAIL; 4483 goto ud_error; 4484 } 4485 } 4486 } 4487 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 4488 if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) { 4489 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4490 4491 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4492 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4493 v6src = ipp->ipp_addr; 4494 } else { 4495 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4496 v6src = ipp->ipp_addr; 4497 } 4498 } 4499 4500 /* Defer IPsec if it might need to look at ICMP type/code */ 4501 switch (ixa->ixa_protocol) { 4502 case IPPROTO_ICMP: 4503 case IPPROTO_ICMPV6: 4504 do_ipsec = B_FALSE; 4505 break; 4506 default: 4507 do_ipsec = B_TRUE; 4508 } 4509 4510 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4511 mutex_exit(&connp->conn_lock); 4512 4513 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4514 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4515 (do_ipsec ? IPDF_IPSEC : 0)); 4516 switch (error) { 4517 case 0: 4518 break; 4519 case EADDRNOTAVAIL: 4520 /* 4521 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4522 * Don't have the application see that errno 4523 */ 4524 error = ENETUNREACH; 4525 goto failed; 4526 case ENETDOWN: 4527 /* 4528 * Have !ipif_addr_ready address; drop packet silently 4529 * until we can get applications to not send until we 4530 * are ready. 4531 */ 4532 error = 0; 4533 goto failed; 4534 case EHOSTUNREACH: 4535 case ENETUNREACH: 4536 if (ixa->ixa_ire != NULL) { 4537 /* 4538 * Let conn_ip_output/ire_send_noroute return 4539 * the error and send any local ICMP error. 4540 */ 4541 error = 0; 4542 break; 4543 } 4544 /* FALLTHRU */ 4545 default: 4546 failed: 4547 goto ud_error; 4548 } 4549 4550 mutex_enter(&connp->conn_lock); 4551 /* 4552 * While we dropped the lock some other thread might have connected 4553 * this socket. If so we bail out with EISCONN to ensure that the 4554 * connecting thread is the one that updates conn_ixa, conn_ht_* 4555 * and conn_*last*. 
4556 */ 4557 if (icmp->icmp_state == TS_DATA_XFER) { 4558 mutex_exit(&connp->conn_lock); 4559 error = EISCONN; 4560 goto ud_error; 4561 } 4562 4563 /* 4564 * We need to rebuild the headers if 4565 * - we are labeling packets (could be different for different 4566 * destinations) 4567 * - we have a source route (or routing header) since we need to 4568 * massage that to get the pseudo-header checksum 4569 * - a socket option with COA_HEADER_CHANGED has been set which 4570 * set conn_v6lastdst to zero. 4571 * 4572 * Otherwise the prepend function will just update the src, dst, 4573 * and flow label. 4574 */ 4575 if (is_system_labeled()) { 4576 /* TX MLP requires SCM_UCRED and don't have that here */ 4577 if (connp->conn_mlp_type != mlptSingle) { 4578 mutex_exit(&connp->conn_lock); 4579 error = ECONNREFUSED; 4580 goto ud_error; 4581 } 4582 /* 4583 * Check whether Trusted Solaris policy allows communication 4584 * with this host, and pretend that the destination is 4585 * unreachable if not. 4586 * Compute any needed label and place it in ipp_label_v4/v6. 4587 * 4588 * Later conn_build_hdr_template/conn_prepend_hdr takes 4589 * ipp_label_v4/v6 to form the packet. 4590 * 4591 * Tsol note: Since we hold conn_lock we know no other 4592 * thread manipulates conn_xmit_ipp. 
4593 */ 4594 error = conn_update_label(connp, ixa, &v6dst, 4595 &connp->conn_xmit_ipp); 4596 if (error != 0) { 4597 mutex_exit(&connp->conn_lock); 4598 goto ud_error; 4599 } 4600 /* Rebuild the header template */ 4601 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4602 flowinfo); 4603 if (error != 0) { 4604 mutex_exit(&connp->conn_lock); 4605 goto ud_error; 4606 } 4607 } else if (connp->conn_xmit_ipp.ipp_fields & 4608 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4609 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4610 /* Rebuild the header template */ 4611 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4612 flowinfo); 4613 if (error != 0) { 4614 mutex_exit(&connp->conn_lock); 4615 goto ud_error; 4616 } 4617 } else { 4618 /* Simply update the destination address if no source route */ 4619 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4620 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4621 4622 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4623 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4624 ipha->ipha_fragment_offset_and_flags |= 4625 IPH_DF_HTONS; 4626 } else { 4627 ipha->ipha_fragment_offset_and_flags &= 4628 ~IPH_DF_HTONS; 4629 } 4630 } else { 4631 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4632 ip6h->ip6_dst = v6dst; 4633 } 4634 } 4635 4636 /* 4637 * Remember the dst etc which corresponds to the built header 4638 * template and conn_ixa. 
4639 */ 4640 oldixa = conn_replace_ixa(connp, ixa); 4641 connp->conn_v6lastdst = v6dst; 4642 connp->conn_lastflowinfo = flowinfo; 4643 connp->conn_lastscopeid = ixa->ixa_scopeid; 4644 connp->conn_lastsrcid = srcid; 4645 /* Also remember a source to use together with lastdst */ 4646 connp->conn_v6lastsrc = v6src; 4647 4648 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4649 flowinfo, &error); 4650 4651 /* Done with conn_t */ 4652 mutex_exit(&connp->conn_lock); 4653 ixa_refrele(oldixa); 4654 4655 if (data_mp == NULL) { 4656 ASSERT(error != 0); 4657 goto ud_error; 4658 } 4659 4660 if (!do_ipsec) { 4661 /* Policy might differ for different ICMP type/code */ 4662 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4663 if (data_mp == NULL) { 4664 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4665 error = EHOSTUNREACH; /* IPsec policy failure */ 4666 goto done; 4667 } 4668 } 4669 4670 /* We're done. Pass the packet to ip. */ 4671 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4672 4673 error = conn_ip_output(data_mp, ixa); 4674 /* No rawipOutErrors if an error since IP increases its error counter */ 4675 switch (error) { 4676 case 0: 4677 break; 4678 case EWOULDBLOCK: 4679 (void) ixa_check_drain_insert(connp, ixa); 4680 error = 0; 4681 break; 4682 case EADDRNOTAVAIL: 4683 /* 4684 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4685 * Don't have the application see that errno 4686 */ 4687 error = ENETUNREACH; 4688 /* FALLTHRU */ 4689 default: 4690 mutex_enter(&connp->conn_lock); 4691 /* 4692 * Clear the source and v6lastdst so we call ip_attr_connect 4693 * for the next packet and try to pick a better source. 
4694 */ 4695 if (connp->conn_mcbc_bind) 4696 connp->conn_saddr_v6 = ipv6_all_zeros; 4697 else 4698 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4699 connp->conn_v6lastdst = ipv6_all_zeros; 4700 mutex_exit(&connp->conn_lock); 4701 break; 4702 } 4703 done: 4704 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4705 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4706 ixa->ixa_cpid = connp->conn_cpid; 4707 ixa_refrele(ixa); 4708 return (error); 4709 4710 ud_error: 4711 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 4712 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4713 ixa->ixa_cpid = connp->conn_cpid; 4714 ixa_refrele(ixa); 4715 4716 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4717 freemsg(data_mp); 4718 return (error); 4719 } 4720 4721 /* ARGSUSED */ 4722 static int 4723 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4724 { 4725 #ifdef DEBUG 4726 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4727 #endif 4728 freemsg(mp); 4729 return (0); 4730 } 4731 4732 static void 4733 icmp_wput_other(queue_t *q, mblk_t *mp) 4734 { 4735 uchar_t *rptr = mp->b_rptr; 4736 struct iocblk *iocp; 4737 conn_t *connp = Q_TO_CONN(q); 4738 icmp_t *icmp = connp->conn_icmp; 4739 cred_t *cr; 4740 4741 switch (mp->b_datap->db_type) { 4742 case M_PROTO: 4743 case M_PCPROTO: 4744 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4745 /* 4746 * If the message does not contain a PRIM_type, 4747 * throw it away. 4748 */ 4749 freemsg(mp); 4750 return; 4751 } 4752 switch (((t_primp_t)rptr)->type) { 4753 case T_ADDR_REQ: 4754 icmp_addr_req(q, mp); 4755 return; 4756 case O_T_BIND_REQ: 4757 case T_BIND_REQ: 4758 icmp_tpi_bind(q, mp); 4759 return; 4760 case T_CONN_REQ: 4761 icmp_tpi_connect(q, mp); 4762 return; 4763 case T_CAPABILITY_REQ: 4764 icmp_capability_req(q, mp); 4765 return; 4766 case T_INFO_REQ: 4767 icmp_info_req(q, mp); 4768 return; 4769 case T_UNITDATA_REQ: 4770 /* 4771 * If a T_UNITDATA_REQ gets here, the address must 4772 * be bad. 
Valid T_UNITDATA_REQs are handled 4773 * in icmp_wput. 4774 */ 4775 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4776 return; 4777 case T_UNBIND_REQ: 4778 icmp_tpi_unbind(q, mp); 4779 return; 4780 case T_SVR4_OPTMGMT_REQ: 4781 /* 4782 * All Solaris components should pass a db_credp 4783 * for this TPI message, hence we ASSERT. 4784 * But in case there is some other M_PROTO that looks 4785 * like a TPI message sent by some other kernel 4786 * component, we check and return an error. 4787 */ 4788 cr = msg_getcred(mp, NULL); 4789 ASSERT(cr != NULL); 4790 if (cr == NULL) { 4791 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4792 return; 4793 } 4794 4795 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4796 cr)) { 4797 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4798 } 4799 return; 4800 4801 case T_OPTMGMT_REQ: 4802 /* 4803 * All Solaris components should pass a db_credp 4804 * for this TPI message, hence we ASSERT. 4805 * But in case there is some other M_PROTO that looks 4806 * like a TPI message sent by some other kernel 4807 * component, we check and return an error. 4808 */ 4809 cr = msg_getcred(mp, NULL); 4810 ASSERT(cr != NULL); 4811 if (cr == NULL) { 4812 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4813 return; 4814 } 4815 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4816 return; 4817 4818 case T_DISCON_REQ: 4819 icmp_tpi_disconnect(q, mp); 4820 return; 4821 4822 /* The following TPI message is not supported by icmp. */ 4823 case O_T_CONN_RES: 4824 case T_CONN_RES: 4825 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4826 return; 4827 4828 /* The following 3 TPI requests are illegal for icmp. 
*/ 4829 case T_DATA_REQ: 4830 case T_EXDATA_REQ: 4831 case T_ORDREL_REQ: 4832 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4833 return; 4834 default: 4835 break; 4836 } 4837 break; 4838 case M_FLUSH: 4839 if (*rptr & FLUSHW) 4840 flushq(q, FLUSHDATA); 4841 break; 4842 case M_IOCTL: 4843 iocp = (struct iocblk *)mp->b_rptr; 4844 switch (iocp->ioc_cmd) { 4845 case TI_GETPEERNAME: 4846 if (icmp->icmp_state != TS_DATA_XFER) { 4847 /* 4848 * If a default destination address has not 4849 * been associated with the stream, then we 4850 * don't know the peer's name. 4851 */ 4852 iocp->ioc_error = ENOTCONN; 4853 iocp->ioc_count = 0; 4854 mp->b_datap->db_type = M_IOCACK; 4855 qreply(q, mp); 4856 return; 4857 } 4858 /* FALLTHRU */ 4859 case TI_GETMYNAME: 4860 /* 4861 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4862 * need to copyin the user's strbuf structure. 4863 * Processing will continue in the M_IOCDATA case 4864 * below. 4865 */ 4866 mi_copyin(q, mp, NULL, 4867 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4868 return; 4869 default: 4870 break; 4871 } 4872 break; 4873 case M_IOCDATA: 4874 icmp_wput_iocdata(q, mp); 4875 return; 4876 default: 4877 /* Unrecognized messages are passed through without change. */ 4878 break; 4879 } 4880 ip_wput_nondata(q, mp); 4881 } 4882 4883 /* 4884 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4885 * messages. 4886 */ 4887 static void 4888 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4889 { 4890 mblk_t *mp1; 4891 STRUCT_HANDLE(strbuf, sb); 4892 uint_t addrlen; 4893 conn_t *connp = Q_TO_CONN(q); 4894 icmp_t *icmp = connp->conn_icmp; 4895 4896 /* Make sure it is one of ours. 
*/ 4897 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4898 case TI_GETMYNAME: 4899 case TI_GETPEERNAME: 4900 break; 4901 default: 4902 ip_wput_nondata(q, mp); 4903 return; 4904 } 4905 4906 switch (mi_copy_state(q, mp, &mp1)) { 4907 case -1: 4908 return; 4909 case MI_COPY_CASE(MI_COPY_IN, 1): 4910 break; 4911 case MI_COPY_CASE(MI_COPY_OUT, 1): 4912 /* 4913 * The address has been copied out, so now 4914 * copyout the strbuf. 4915 */ 4916 mi_copyout(q, mp); 4917 return; 4918 case MI_COPY_CASE(MI_COPY_OUT, 2): 4919 /* 4920 * The address and strbuf have been copied out. 4921 * We're done, so just acknowledge the original 4922 * M_IOCTL. 4923 */ 4924 mi_copy_done(q, mp, 0); 4925 return; 4926 default: 4927 /* 4928 * Something strange has happened, so acknowledge 4929 * the original M_IOCTL with an EPROTO error. 4930 */ 4931 mi_copy_done(q, mp, EPROTO); 4932 return; 4933 } 4934 4935 /* 4936 * Now we have the strbuf structure for TI_GETMYNAME 4937 * and TI_GETPEERNAME. Next we copyout the requested 4938 * address and then we'll copyout the strbuf. 
4939 */ 4940 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4941 (void *)mp1->b_rptr); 4942 4943 if (connp->conn_family == AF_INET) 4944 addrlen = sizeof (sin_t); 4945 else 4946 addrlen = sizeof (sin6_t); 4947 4948 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4949 mi_copy_done(q, mp, EINVAL); 4950 return; 4951 } 4952 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4953 case TI_GETMYNAME: 4954 break; 4955 case TI_GETPEERNAME: 4956 if (icmp->icmp_state != TS_DATA_XFER) { 4957 mi_copy_done(q, mp, ENOTCONN); 4958 return; 4959 } 4960 break; 4961 default: 4962 mi_copy_done(q, mp, EPROTO); 4963 return; 4964 } 4965 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4966 if (!mp1) 4967 return; 4968 4969 STRUCT_FSET(sb, len, addrlen); 4970 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4971 case TI_GETMYNAME: 4972 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4973 &addrlen); 4974 break; 4975 case TI_GETPEERNAME: 4976 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4977 &addrlen); 4978 break; 4979 } 4980 mp1->b_wptr += addrlen; 4981 /* Copy out the address */ 4982 mi_copyout(q, mp); 4983 } 4984 4985 void 4986 icmp_ddi_g_init(void) 4987 { 4988 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4989 icmp_opt_obj.odb_opt_arr_cnt); 4990 4991 /* 4992 * We want to be informed each time a stack is created or 4993 * destroyed in the kernel, so we can maintain the 4994 * set of icmp_stack_t's. 4995 */ 4996 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4997 } 4998 4999 void 5000 icmp_ddi_g_destroy(void) 5001 { 5002 netstack_unregister(NS_ICMP); 5003 } 5004 5005 #define INET_NAME "ip" 5006 5007 /* 5008 * Initialize the ICMP stack instance. 
5009 */ 5010 static void * 5011 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 5012 { 5013 icmp_stack_t *is; 5014 int error = 0; 5015 size_t arrsz; 5016 major_t major; 5017 5018 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 5019 is->is_netstack = ns; 5020 5021 arrsz = sizeof (icmp_propinfo_tbl); 5022 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 5023 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 5024 5025 is->is_ksp = rawip_kstat_init(stackid); 5026 5027 major = mod_name_to_major(INET_NAME); 5028 error = ldi_ident_from_major(major, &is->is_ldi_ident); 5029 ASSERT(error == 0); 5030 return (is); 5031 } 5032 5033 /* 5034 * Free the ICMP stack instance. 5035 */ 5036 static void 5037 rawip_stack_fini(netstackid_t stackid, void *arg) 5038 { 5039 icmp_stack_t *is = (icmp_stack_t *)arg; 5040 5041 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 5042 is->is_propinfo_tbl = NULL; 5043 5044 rawip_kstat_fini(stackid, is->is_ksp); 5045 is->is_ksp = NULL; 5046 ldi_ident_release(is->is_ldi_ident); 5047 kmem_free(is, sizeof (*is)); 5048 } 5049 5050 static void * 5051 rawip_kstat_init(netstackid_t stackid) 5052 { 5053 kstat_t *ksp; 5054 5055 rawip_named_kstat_t template = { 5056 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5057 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5058 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5059 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5060 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5061 }; 5062 5063 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5064 KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid); 5065 if (ksp == NULL || ksp->ks_data == NULL) 5066 return (NULL); 5067 5068 bcopy(&template, ksp->ks_data, sizeof (template)); 5069 ksp->ks_update = rawip_kstat_update; 5070 ksp->ks_private = (void *)(uintptr_t)stackid; 5071 5072 kstat_install(ksp); 5073 return (ksp); 5074 } 5075 5076 static void 5077 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5078 { 5079 if (ksp != NULL) { 
5080 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5081 kstat_delete_netstack(ksp, stackid); 5082 } 5083 } 5084 5085 static int 5086 rawip_kstat_update(kstat_t *ksp, int rw) 5087 { 5088 rawip_named_kstat_t *rawipkp; 5089 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5090 netstack_t *ns; 5091 icmp_stack_t *is; 5092 5093 if (ksp->ks_data == NULL) 5094 return (EIO); 5095 5096 if (rw == KSTAT_WRITE) 5097 return (EACCES); 5098 5099 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5100 5101 ns = netstack_find_by_stackid(stackid); 5102 if (ns == NULL) 5103 return (-1); 5104 is = ns->netstack_icmp; 5105 if (is == NULL) { 5106 netstack_rele(ns); 5107 return (-1); 5108 } 5109 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5110 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5111 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5112 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5113 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5114 netstack_rele(ns); 5115 return (0); 5116 } 5117 5118 /* ARGSUSED */ 5119 int 5120 rawip_accept(sock_lower_handle_t lproto_handle, 5121 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5122 cred_t *cr) 5123 { 5124 return (EOPNOTSUPP); 5125 } 5126 5127 /* ARGSUSED */ 5128 int 5129 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5130 socklen_t len, cred_t *cr) 5131 { 5132 conn_t *connp = (conn_t *)proto_handle; 5133 int error; 5134 5135 /* All Solaris components should pass a cred for this operation. 
*/ 5136 ASSERT(cr != NULL); 5137 5138 /* Binding to a NULL address really means unbind */ 5139 if (sa == NULL) 5140 error = rawip_do_unbind(connp); 5141 else 5142 error = rawip_do_bind(connp, sa, len); 5143 5144 if (error < 0) { 5145 if (error == -TOUTSTATE) 5146 error = EINVAL; 5147 else 5148 error = proto_tlitosyserr(-error); 5149 } 5150 return (error); 5151 } 5152 5153 static int 5154 rawip_implicit_bind(conn_t *connp) 5155 { 5156 sin6_t sin6addr; 5157 sin_t *sin; 5158 sin6_t *sin6; 5159 socklen_t len; 5160 int error; 5161 5162 if (connp->conn_family == AF_INET) { 5163 len = sizeof (struct sockaddr_in); 5164 sin = (sin_t *)&sin6addr; 5165 *sin = sin_null; 5166 sin->sin_family = AF_INET; 5167 sin->sin_addr.s_addr = INADDR_ANY; 5168 } else { 5169 ASSERT(connp->conn_family == AF_INET6); 5170 len = sizeof (sin6_t); 5171 sin6 = (sin6_t *)&sin6addr; 5172 *sin6 = sin6_null; 5173 sin6->sin6_family = AF_INET6; 5174 V6_SET_ZERO(sin6->sin6_addr); 5175 } 5176 5177 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5178 5179 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5180 } 5181 5182 static int 5183 rawip_unbind(conn_t *connp) 5184 { 5185 int error; 5186 5187 error = rawip_do_unbind(connp); 5188 if (error < 0) { 5189 error = proto_tlitosyserr(-error); 5190 } 5191 return (error); 5192 } 5193 5194 /* ARGSUSED */ 5195 int 5196 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5197 { 5198 return (EOPNOTSUPP); 5199 } 5200 5201 int 5202 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5203 socklen_t len, sock_connid_t *id, cred_t *cr) 5204 { 5205 conn_t *connp = (conn_t *)proto_handle; 5206 icmp_t *icmp = connp->conn_icmp; 5207 int error; 5208 boolean_t did_bind = B_FALSE; 5209 pid_t pid = curproc->p_pid; 5210 5211 /* All Solaris components should pass a cred for this operation. 
*/ 5212 ASSERT(cr != NULL); 5213 5214 if (sa == NULL) { 5215 /* 5216 * Disconnect 5217 * Make sure we are connected 5218 */ 5219 if (icmp->icmp_state != TS_DATA_XFER) 5220 return (EINVAL); 5221 5222 error = icmp_disconnect(connp); 5223 return (error); 5224 } 5225 5226 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5227 if (error != 0) 5228 return (error); 5229 5230 /* do an implicit bind if necessary */ 5231 if (icmp->icmp_state == TS_UNBND) { 5232 error = rawip_implicit_bind(connp); 5233 /* 5234 * We could be racing with an actual bind, in which case 5235 * we would see EPROTO. We cross our fingers and try 5236 * to connect. 5237 */ 5238 if (!(error == 0 || error == EPROTO)) 5239 return (error); 5240 did_bind = B_TRUE; 5241 } 5242 5243 /* 5244 * set SO_DGRAM_ERRIND 5245 */ 5246 connp->conn_dgram_errind = B_TRUE; 5247 5248 error = rawip_do_connect(connp, sa, len, cr, pid); 5249 if (error != 0 && did_bind) { 5250 int unbind_err; 5251 5252 unbind_err = rawip_unbind(connp); 5253 ASSERT(unbind_err == 0); 5254 } 5255 5256 if (error == 0) { 5257 *id = 0; 5258 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5259 0, NULL, -1); 5260 } else if (error < 0) { 5261 error = proto_tlitosyserr(-error); 5262 } 5263 return (error); 5264 } 5265 5266 /* ARGSUSED2 */ 5267 int 5268 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5269 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 5270 sock_quiesce_arg_t *arg) 5271 { 5272 conn_t *connp = (conn_t *)proto_handle; 5273 icmp_t *icmp; 5274 struct T_capability_ack tca; 5275 struct sockaddr_in6 laddr, faddr; 5276 socklen_t laddrlen, faddrlen; 5277 short opts; 5278 struct stroptions *stropt; 5279 mblk_t *mp, *stropt_mp; 5280 int error; 5281 5282 icmp = connp->conn_icmp; 5283 5284 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5285 5286 /* 5287 * setup the fallback stream that was allocated 5288 */ 5289 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5290 connp->conn_minor_arena = 
WR(q)->q_ptr; 5291 5292 RD(q)->q_ptr = WR(q)->q_ptr = connp; 5293 5294 WR(q)->q_qinfo = &icmpwinit; 5295 5296 connp->conn_rq = RD(q); 5297 connp->conn_wq = WR(q); 5298 5299 /* Notify stream head about options before sending up data */ 5300 stropt_mp->b_datap->db_type = M_SETOPTS; 5301 stropt_mp->b_wptr += sizeof (*stropt); 5302 stropt = (struct stroptions *)stropt_mp->b_rptr; 5303 stropt->so_flags = SO_WROFF | SO_HIWAT; 5304 stropt->so_wroff = connp->conn_wroff; 5305 stropt->so_hiwat = connp->conn_rcvbuf; 5306 putnext(RD(q), stropt_mp); 5307 5308 /* 5309 * free helper stream 5310 */ 5311 ip_free_helper_stream(connp); 5312 5313 /* 5314 * Collect the information needed to sync with the sonode 5315 */ 5316 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5317 5318 laddrlen = faddrlen = sizeof (sin6_t); 5319 (void) rawip_getsockname((sock_lower_handle_t)connp, 5320 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5321 error = rawip_getpeername((sock_lower_handle_t)connp, 5322 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5323 if (error != 0) 5324 faddrlen = 0; 5325 opts = 0; 5326 if (connp->conn_dgram_errind) 5327 opts |= SO_DGRAM_ERRIND; 5328 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5329 opts |= SO_DONTROUTE; 5330 5331 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 5332 (struct sockaddr *)&laddr, laddrlen, 5333 (struct sockaddr *)&faddr, faddrlen, opts); 5334 5335 /* 5336 * Attempts to send data up during fallback will result in it being 5337 * queued in icmp_t. Now we push up any queued packets. 
5338 */ 5339 mutex_enter(&icmp->icmp_recv_lock); 5340 if (mp != NULL) { 5341 mp->b_next = icmp->icmp_fallback_queue_head; 5342 icmp->icmp_fallback_queue_head = mp; 5343 } 5344 while (icmp->icmp_fallback_queue_head != NULL) { 5345 mp = icmp->icmp_fallback_queue_head; 5346 icmp->icmp_fallback_queue_head = mp->b_next; 5347 mp->b_next = NULL; 5348 mutex_exit(&icmp->icmp_recv_lock); 5349 putnext(RD(q), mp); 5350 mutex_enter(&icmp->icmp_recv_lock); 5351 } 5352 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5353 5354 /* 5355 * No longer a streams less socket 5356 */ 5357 mutex_enter(&connp->conn_lock); 5358 connp->conn_flags &= ~IPCL_NONSTR; 5359 mutex_exit(&connp->conn_lock); 5360 5361 mutex_exit(&icmp->icmp_recv_lock); 5362 5363 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5364 icmp->icmp_fallback_queue_tail == NULL); 5365 5366 ASSERT(connp->conn_ref >= 1); 5367 5368 return (0); 5369 } 5370 5371 /* ARGSUSED2 */ 5372 sock_lower_handle_t 5373 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5374 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5375 { 5376 conn_t *connp; 5377 5378 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5379 *errorp = EPROTONOSUPPORT; 5380 return (NULL); 5381 } 5382 5383 connp = rawip_do_open(family, credp, errorp, flags); 5384 if (connp != NULL) { 5385 connp->conn_flags |= IPCL_NONSTR; 5386 5387 mutex_enter(&connp->conn_lock); 5388 connp->conn_state_flags &= ~CONN_INCIPIENT; 5389 mutex_exit(&connp->conn_lock); 5390 *sock_downcalls = &sock_rawip_downcalls; 5391 *smodep = SM_ATOMIC; 5392 } else { 5393 ASSERT(*errorp != 0); 5394 } 5395 5396 return ((sock_lower_handle_t)connp); 5397 } 5398 5399 /* ARGSUSED3 */ 5400 void 5401 rawip_activate(sock_lower_handle_t proto_handle, 5402 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5403 cred_t *cr) 5404 { 5405 conn_t *connp = (conn_t *)proto_handle; 5406 struct sock_proto_props sopp; 5407 5408 /* All 
Solaris components should pass a cred for this operation. */ 5409 ASSERT(cr != NULL); 5410 5411 connp->conn_upcalls = sock_upcalls; 5412 connp->conn_upper_handle = sock_handle; 5413 5414 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5415 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5416 sopp.sopp_wroff = connp->conn_wroff; 5417 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5418 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5419 sopp.sopp_maxblk = INFPSZ; 5420 sopp.sopp_maxpsz = IP_MAXPACKET; 5421 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5422 icmp_mod_info.mi_minpsz; 5423 5424 (*connp->conn_upcalls->su_set_proto_props) 5425 (connp->conn_upper_handle, &sopp); 5426 5427 icmp_bind_proto(connp->conn_icmp); 5428 } 5429 5430 /* ARGSUSED3 */ 5431 int 5432 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5433 socklen_t *salenp, cred_t *cr) 5434 { 5435 conn_t *connp = (conn_t *)proto_handle; 5436 icmp_t *icmp = connp->conn_icmp; 5437 int error; 5438 5439 /* All Solaris components should pass a cred for this operation. */ 5440 ASSERT(cr != NULL); 5441 5442 mutex_enter(&connp->conn_lock); 5443 if (icmp->icmp_state != TS_DATA_XFER) 5444 error = ENOTCONN; 5445 else 5446 error = conn_getpeername(connp, sa, salenp); 5447 mutex_exit(&connp->conn_lock); 5448 return (error); 5449 } 5450 5451 /* ARGSUSED3 */ 5452 int 5453 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5454 socklen_t *salenp, cred_t *cr) 5455 { 5456 conn_t *connp = (conn_t *)proto_handle; 5457 int error; 5458 5459 /* All Solaris components should pass a cred for this operation. 
*/ 5460 ASSERT(cr != NULL); 5461 5462 mutex_enter(&connp->conn_lock); 5463 error = conn_getsockname(connp, sa, salenp); 5464 mutex_exit(&connp->conn_lock); 5465 return (error); 5466 } 5467 5468 int 5469 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5470 const void *optvalp, socklen_t optlen, cred_t *cr) 5471 { 5472 conn_t *connp = (conn_t *)proto_handle; 5473 int error; 5474 5475 /* All Solaris components should pass a cred for this operation. */ 5476 ASSERT(cr != NULL); 5477 5478 error = proto_opt_check(level, option_name, optlen, NULL, 5479 icmp_opt_obj.odb_opt_des_arr, 5480 icmp_opt_obj.odb_opt_arr_cnt, 5481 B_TRUE, B_FALSE, cr); 5482 5483 if (error != 0) { 5484 /* 5485 * option not recognized 5486 */ 5487 if (error < 0) { 5488 error = proto_tlitosyserr(-error); 5489 } 5490 return (error); 5491 } 5492 5493 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5494 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5495 (uchar_t *)optvalp, NULL, cr); 5496 5497 ASSERT(error >= 0); 5498 5499 return (error); 5500 } 5501 5502 int 5503 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5504 void *optvalp, socklen_t *optlen, cred_t *cr) 5505 { 5506 int error; 5507 conn_t *connp = (conn_t *)proto_handle; 5508 t_uscalar_t max_optbuf_len; 5509 void *optvalp_buf; 5510 int len; 5511 5512 /* All Solaris components should pass a cred for this operation. 
*/ 5513 ASSERT(cr != NULL); 5514 5515 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5516 icmp_opt_obj.odb_opt_des_arr, 5517 icmp_opt_obj.odb_opt_arr_cnt, 5518 B_FALSE, B_TRUE, cr); 5519 5520 if (error != 0) { 5521 if (error < 0) { 5522 error = proto_tlitosyserr(-error); 5523 } 5524 return (error); 5525 } 5526 5527 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5528 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5529 if (len == -1) { 5530 kmem_free(optvalp_buf, max_optbuf_len); 5531 return (EINVAL); 5532 } 5533 5534 /* 5535 * update optlen and copy option value 5536 */ 5537 t_uscalar_t size = MIN(len, *optlen); 5538 5539 bcopy(optvalp_buf, optvalp, size); 5540 bcopy(&size, optlen, sizeof (size)); 5541 5542 kmem_free(optvalp_buf, max_optbuf_len); 5543 return (0); 5544 } 5545 5546 /* ARGSUSED1 */ 5547 int 5548 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5549 { 5550 conn_t *connp = (conn_t *)proto_handle; 5551 5552 /* All Solaris components should pass a cred for this operation. */ 5553 ASSERT(cr != NULL); 5554 5555 (void) rawip_do_close(connp); 5556 return (0); 5557 } 5558 5559 /* ARGSUSED2 */ 5560 int 5561 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5562 { 5563 conn_t *connp = (conn_t *)proto_handle; 5564 5565 /* All Solaris components should pass a cred for this operation. 
*/ 5566 ASSERT(cr != NULL); 5567 5568 /* shut down the send side */ 5569 if (how != SHUT_RD) 5570 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5571 SOCK_OPCTL_SHUT_SEND, 0); 5572 /* shut down the recv side */ 5573 if (how != SHUT_WR) 5574 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5575 SOCK_OPCTL_SHUT_RECV, 0); 5576 return (0); 5577 } 5578 5579 void 5580 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5581 { 5582 conn_t *connp = (conn_t *)proto_handle; 5583 icmp_t *icmp = connp->conn_icmp; 5584 5585 mutex_enter(&icmp->icmp_recv_lock); 5586 connp->conn_flow_cntrld = B_FALSE; 5587 mutex_exit(&icmp->icmp_recv_lock); 5588 } 5589 5590 int 5591 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5592 int mode, int32_t *rvalp, cred_t *cr) 5593 { 5594 conn_t *connp = (conn_t *)proto_handle; 5595 int error; 5596 5597 /* All Solaris components should pass a cred for this operation. */ 5598 ASSERT(cr != NULL); 5599 5600 /* 5601 * If we don't have a helper stream then create one. 5602 * ip_create_helper_stream takes care of locking the conn_t, 5603 * so this check for NULL is just a performance optimization. 5604 */ 5605 if (connp->conn_helper_info == NULL) { 5606 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5607 5608 ASSERT(is->is_ldi_ident != NULL); 5609 5610 /* 5611 * Create a helper stream for non-STREAMS socket. 
5612 */ 5613 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5614 if (error != 0) { 5615 ip0dbg(("rawip_ioctl: create of IP helper stream " 5616 "failed %d\n", error)); 5617 return (error); 5618 } 5619 } 5620 5621 switch (cmd) { 5622 case _SIOCSOCKFALLBACK: 5623 case TI_GETPEERNAME: 5624 case TI_GETMYNAME: 5625 #ifdef DEBUG 5626 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5627 " socket", cmd); 5628 #endif 5629 error = EINVAL; 5630 break; 5631 default: 5632 /* 5633 * Pass on to IP using helper stream 5634 */ 5635 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5636 cmd, arg, mode, cr, rvalp); 5637 break; 5638 } 5639 return (error); 5640 } 5641 5642 int 5643 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5644 cred_t *cr) 5645 { 5646 sin6_t *sin6; 5647 sin_t *sin = NULL; 5648 uint_t srcid; 5649 conn_t *connp = (conn_t *)proto_handle; 5650 icmp_t *icmp = connp->conn_icmp; 5651 int error = 0; 5652 icmp_stack_t *is = icmp->icmp_is; 5653 pid_t pid = curproc->p_pid; 5654 ip_xmit_attr_t *ixa; 5655 5656 ASSERT(DB_TYPE(mp) == M_DATA); 5657 5658 /* All Solaris components should pass a cred for this operation. */ 5659 ASSERT(cr != NULL); 5660 5661 /* do an implicit bind if necessary */ 5662 if (icmp->icmp_state == TS_UNBND) { 5663 error = rawip_implicit_bind(connp); 5664 /* 5665 * We could be racing with an actual bind, in which case 5666 * we would see EPROTO. We cross our fingers and try 5667 * to connect. 
5668 */ 5669 if (!(error == 0 || error == EPROTO)) { 5670 freemsg(mp); 5671 return (error); 5672 } 5673 } 5674 5675 /* Protocol 255 contains full IP headers */ 5676 /* Read without holding lock */ 5677 if (icmp->icmp_hdrincl) { 5678 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5679 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5680 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5681 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5682 freemsg(mp); 5683 return (EINVAL); 5684 } 5685 } 5686 error = icmp_output_hdrincl(connp, mp, cr, pid); 5687 if (is->is_sendto_ignerr) 5688 return (0); 5689 else 5690 return (error); 5691 } 5692 5693 /* Connected? */ 5694 if (msg->msg_name == NULL) { 5695 if (icmp->icmp_state != TS_DATA_XFER) { 5696 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5697 return (EDESTADDRREQ); 5698 } 5699 if (msg->msg_controllen != 0) { 5700 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5701 NULL, msg, cr, pid); 5702 } else { 5703 error = icmp_output_connected(connp, mp, cr, pid); 5704 } 5705 if (is->is_sendto_ignerr) 5706 return (0); 5707 else 5708 return (error); 5709 } 5710 if (icmp->icmp_state == TS_DATA_XFER) { 5711 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5712 return (EISCONN); 5713 } 5714 error = proto_verify_ip_addr(connp->conn_family, 5715 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5716 if (error != 0) { 5717 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5718 return (error); 5719 } 5720 switch (connp->conn_family) { 5721 case AF_INET6: 5722 sin6 = (sin6_t *)msg->msg_name; 5723 5724 /* No support for mapped addresses on raw sockets */ 5725 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5726 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5727 return (EADDRNOTAVAIL); 5728 } 5729 srcid = sin6->__sin6_src_id; 5730 5731 /* 5732 * If the local address is a mapped address return 5733 * an error. 
5734 * It would be possible to send an IPv6 packet but the 5735 * response would never make it back to the application 5736 * since it is bound to a mapped address. 5737 */ 5738 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5739 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5740 return (EADDRNOTAVAIL); 5741 } 5742 5743 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5744 sin6->sin6_addr = ipv6_loopback; 5745 5746 /* 5747 * We have to allocate an ip_xmit_attr_t before we grab 5748 * conn_lock and we need to hold conn_lock once we've check 5749 * conn_same_as_last_v6 to handle concurrent send* calls on a 5750 * socket. 5751 */ 5752 if (msg->msg_controllen == 0) { 5753 ixa = conn_get_ixa(connp, B_FALSE); 5754 if (ixa == NULL) { 5755 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5756 return (ENOMEM); 5757 } 5758 } else { 5759 ixa = NULL; 5760 } 5761 mutex_enter(&connp->conn_lock); 5762 if (icmp->icmp_delayed_error != 0) { 5763 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5764 5765 error = icmp->icmp_delayed_error; 5766 icmp->icmp_delayed_error = 0; 5767 5768 /* Compare IP address and family */ 5769 5770 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5771 &sin2->sin6_addr) && 5772 sin6->sin6_family == sin2->sin6_family) { 5773 mutex_exit(&connp->conn_lock); 5774 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5775 if (ixa != NULL) 5776 ixa_refrele(ixa); 5777 return (error); 5778 } 5779 } 5780 if (msg->msg_controllen != 0) { 5781 mutex_exit(&connp->conn_lock); 5782 ASSERT(ixa == NULL); 5783 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5784 NULL, msg, cr, pid); 5785 } else if (conn_same_as_last_v6(connp, sin6) && 5786 connp->conn_lastsrcid == srcid && 5787 ipsec_outbound_policy_current(ixa)) { 5788 /* icmp_output_lastdst drops conn_lock */ 5789 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5790 } else { 5791 /* icmp_output_newdst drops conn_lock */ 5792 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5793 pid, ixa); 5794 } 5795 
ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5796 if (is->is_sendto_ignerr) 5797 return (0); 5798 else 5799 return (error); 5800 case AF_INET: 5801 sin = (sin_t *)msg->msg_name; 5802 5803 if (sin->sin_addr.s_addr == INADDR_ANY) 5804 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5805 5806 /* 5807 * We have to allocate an ip_xmit_attr_t before we grab 5808 * conn_lock and we need to hold conn_lock once we've check 5809 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5810 */ 5811 if (msg->msg_controllen == 0) { 5812 ixa = conn_get_ixa(connp, B_FALSE); 5813 if (ixa == NULL) { 5814 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5815 return (ENOMEM); 5816 } 5817 } else { 5818 ixa = NULL; 5819 } 5820 mutex_enter(&connp->conn_lock); 5821 if (icmp->icmp_delayed_error != 0) { 5822 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5823 5824 error = icmp->icmp_delayed_error; 5825 icmp->icmp_delayed_error = 0; 5826 5827 /* Compare IP address */ 5828 5829 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5830 mutex_exit(&connp->conn_lock); 5831 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5832 if (ixa != NULL) 5833 ixa_refrele(ixa); 5834 return (error); 5835 } 5836 } 5837 5838 if (msg->msg_controllen != 0) { 5839 mutex_exit(&connp->conn_lock); 5840 ASSERT(ixa == NULL); 5841 error = icmp_output_ancillary(connp, sin, NULL, mp, 5842 NULL, msg, cr, pid); 5843 } else if (conn_same_as_last_v4(connp, sin) && 5844 ipsec_outbound_policy_current(ixa)) { 5845 /* icmp_output_lastdst drops conn_lock */ 5846 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5847 } else { 5848 /* icmp_output_newdst drops conn_lock */ 5849 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5850 pid, ixa); 5851 } 5852 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5853 if (is->is_sendto_ignerr) 5854 return (0); 5855 else 5856 return (error); 5857 default: 5858 return (EINVAL); 5859 } 5860 } 5861 5862 sock_downcalls_t sock_rawip_downcalls = { 5863 rawip_activate, 5864 rawip_accept, 5865 
rawip_bind, 5866 rawip_listen, 5867 rawip_connect, 5868 rawip_getpeername, 5869 rawip_getsockname, 5870 rawip_getsockopt, 5871 rawip_setsockopt, 5872 rawip_send, 5873 NULL, 5874 NULL, 5875 NULL, 5876 rawip_shutdown, 5877 rawip_clr_flowctrl, 5878 rawip_ioctl, 5879 rawip_close 5880 }; 5881