1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. 25 * Copyright (c) 2018, Joyent, Inc. 26 */ 27 /* Copyright (c) 1990 Mentat Inc. 
*/ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/stropts.h> 32 #include <sys/strlog.h> 33 #include <sys/strsun.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/timod.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/strsubr.h> 40 #include <sys/suntpi.h> 41 #include <sys/xti_inet.h> 42 #include <sys/cmn_err.h> 43 #include <sys/kmem.h> 44 #include <sys/cred.h> 45 #include <sys/policy.h> 46 #include <sys/priv.h> 47 #include <sys/ucred.h> 48 #include <sys/zone.h> 49 50 #include <sys/sockio.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 #include <sys/vtrace.h> 54 #include <sys/sdt.h> 55 #include <sys/debug.h> 56 #include <sys/isa_defs.h> 57 #include <sys/random.h> 58 #include <netinet/in.h> 59 #include <netinet/ip6.h> 60 #include <netinet/icmp6.h> 61 #include <netinet/udp.h> 62 63 #include <inet/common.h> 64 #include <inet/ip.h> 65 #include <inet/ip_impl.h> 66 #include <inet/ipsec_impl.h> 67 #include <inet/ip6.h> 68 #include <inet/ip_ire.h> 69 #include <inet/ip_if.h> 70 #include <inet/ip_multi.h> 71 #include <inet/ip_ndp.h> 72 #include <inet/proto_set.h> 73 #include <inet/mib2.h> 74 #include <inet/nd.h> 75 #include <inet/optcom.h> 76 #include <inet/snmpcom.h> 77 #include <inet/kstatcom.h> 78 #include <inet/ipclassifier.h> 79 80 #include <sys/tsol/label.h> 81 #include <sys/tsol/tnet.h> 82 83 #include <inet/rawip_impl.h> 84 85 #include <sys/disp.h> 86 87 /* 88 * Synchronization notes: 89 * 90 * RAWIP is MT and uses the usual kernel synchronization primitives. We use 91 * conn_lock to protect the icmp_t. 92 * 93 * Plumbing notes: 94 * ICMP is always a device driver. For compatibility with mibopen() code 95 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough 96 * dummy module. 
97 */ 98 static void icmp_addr_req(queue_t *q, mblk_t *mp); 99 static void icmp_tpi_bind(queue_t *q, mblk_t *mp); 100 static void icmp_bind_proto(icmp_t *icmp); 101 static int icmp_build_hdr_template(conn_t *, const in6_addr_t *, 102 const in6_addr_t *, uint32_t); 103 static void icmp_capability_req(queue_t *q, mblk_t *mp); 104 static int icmp_close(queue_t *q, int flags, cred_t *); 105 static void icmp_close_free(conn_t *); 106 static void icmp_tpi_connect(queue_t *q, mblk_t *mp); 107 static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp); 108 static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, 109 int sys_error); 110 static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 111 t_scalar_t tlierr, int sys_error); 112 static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, 113 ip_recv_attr_t *); 114 static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, 115 ip_recv_attr_t *); 116 static void icmp_info_req(queue_t *q, mblk_t *mp); 117 static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); 118 static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags); 119 static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, 120 cred_t *credp); 121 static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, 122 cred_t *credp); 123 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); 124 int icmp_opt_set(conn_t *connp, uint_t optset_context, 125 int level, int name, uint_t inlen, 126 uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 127 void *thisdg_attrs, cred_t *cr); 128 int icmp_opt_get(conn_t *connp, int level, int name, 129 uchar_t *ptr); 130 static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, 131 sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa); 132 static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, 133 const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *); 134 static mblk_t 
*icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *, 135 mblk_t *, const in6_addr_t *, uint32_t, int *); 136 static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 137 uchar_t *ptr, int len); 138 static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); 139 static void icmp_tpi_unbind(queue_t *q, mblk_t *mp); 140 static int icmp_wput(queue_t *q, mblk_t *mp); 141 static int icmp_wput_fallback(queue_t *q, mblk_t *mp); 142 static void icmp_wput_other(queue_t *q, mblk_t *mp); 143 static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); 144 static void icmp_wput_restricted(queue_t *q, mblk_t *mp); 145 static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t); 146 147 static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns); 148 static void rawip_stack_fini(netstackid_t stackid, void *arg); 149 150 static void *rawip_kstat_init(netstackid_t stackid); 151 static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp); 152 static int rawip_kstat_update(kstat_t *kp, int rw); 153 static void rawip_stack_shutdown(netstackid_t stackid, void *arg); 154 155 /* Common routines for TPI and socket module */ 156 static conn_t *rawip_do_open(int, cred_t *, int *, int); 157 static void rawip_do_close(conn_t *); 158 static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t); 159 static int rawip_do_unbind(conn_t *); 160 static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t, 161 cred_t *, pid_t); 162 163 int rawip_getsockname(sock_lower_handle_t, struct sockaddr *, 164 socklen_t *, cred_t *); 165 int rawip_getpeername(sock_lower_handle_t, struct sockaddr *, 166 socklen_t *, cred_t *); 167 168 static struct module_info icmp_mod_info = { 169 5707, "icmp", 1, INFPSZ, 512, 128 170 }; 171 172 /* 173 * Entry points for ICMP as a device. 174 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices. 
175 */ 176 static struct qinit icmprinitv4 = { 177 NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info 178 }; 179 180 static struct qinit icmprinitv6 = { 181 NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info 182 }; 183 184 static struct qinit icmpwinit = { 185 icmp_wput, ip_wsrv, NULL, NULL, NULL, &icmp_mod_info 186 }; 187 188 /* ICMP entry point during fallback */ 189 static struct qinit icmp_fallback_sock_winit = { 190 icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info 191 }; 192 193 /* For AF_INET aka /dev/icmp */ 194 struct streamtab icmpinfov4 = { 195 &icmprinitv4, &icmpwinit 196 }; 197 198 /* For AF_INET6 aka /dev/icmp6 */ 199 struct streamtab icmpinfov6 = { 200 &icmprinitv6, &icmpwinit 201 }; 202 203 /* Default structure copied into T_INFO_ACK messages */ 204 static struct T_info_ack icmp_g_t_info_ack = { 205 T_INFO_ACK, 206 IP_MAXPACKET, /* TSDU_size. icmp allows maximum size messages. */ 207 T_INVALID, /* ETSDU_size. icmp does not support expedited data. */ 208 T_INVALID, /* CDATA_size. icmp does not support connect data. */ 209 T_INVALID, /* DDATA_size. icmp does not support disconnect data. */ 210 0, /* ADDR_size - filled in later. */ 211 0, /* OPT_size - not initialized here */ 212 IP_MAXPACKET, /* TIDU_size. icmp allows maximum size messages. */ 213 T_CLTS, /* SERV_type. icmp supports connection-less. */ 214 TS_UNBND, /* CURRENT_state. This is set from icmp_state. 
*/ 215 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 216 }; 217 218 static int 219 icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo, 220 const char *ifname, const void *pval, uint_t flags) 221 { 222 return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl, 223 stack, cr, pinfo, ifname, pval, flags)); 224 } 225 226 static int 227 icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname, 228 void *val, uint_t psize, uint_t flags) 229 { 230 return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack, 231 pinfo, ifname, val, psize, flags)); 232 } 233 234 /* 235 * All of these are alterable, within the min/max values given, at run time. 236 * 237 * Note: All those tunables which do not start with "icmp_" are Committed and 238 * therefore are public. See PSARC 2010/080. 239 */ 240 static mod_prop_info_t icmp_propinfo_tbl[] = { 241 /* tunable - 0 */ 242 { "_wroff_extra", MOD_PROTO_RAWIP, 243 mod_set_uint32, mod_get_uint32, 244 {0, 128, 32}, {32} }, 245 246 { "_ipv4_ttl", MOD_PROTO_RAWIP, 247 mod_set_uint32, mod_get_uint32, 248 {1, 255, 255}, {255} }, 249 250 { "_ipv6_hoplimit", MOD_PROTO_RAWIP, 251 mod_set_uint32, mod_get_uint32, 252 {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS}, 253 {IPV6_DEFAULT_HOPS} }, 254 255 { "_bsd_compat", MOD_PROTO_RAWIP, 256 mod_set_boolean, mod_get_boolean, 257 {B_TRUE}, {B_TRUE} }, 258 259 { "send_buf", MOD_PROTO_RAWIP, 260 icmp_set_buf_prop, icmp_get_buf_prop, 261 {4096, 65536, 8192}, {8192} }, 262 263 { "_xmit_lowat", MOD_PROTO_RAWIP, 264 mod_set_uint32, mod_get_uint32, 265 {0, 65536, 1024}, {1024} }, 266 267 { "recv_buf", MOD_PROTO_RAWIP, 268 icmp_set_buf_prop, icmp_get_buf_prop, 269 {4096, 65536, 8192}, {8192} }, 270 271 { "max_buf", MOD_PROTO_RAWIP, 272 mod_set_uint32, mod_get_uint32, 273 {65536, ULP_MAX_BUF, 256*1024}, {256*1024} }, 274 275 { "_pmtu_discovery", MOD_PROTO_RAWIP, 276 mod_set_boolean, mod_get_boolean, 277 {B_FALSE}, {B_FALSE} }, 278 279 { "_sendto_ignerr", MOD_PROTO_RAWIP, 280 
mod_set_boolean, mod_get_boolean, 281 {B_FALSE}, {B_FALSE} }, 282 283 { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} }, 284 285 { NULL, 0, NULL, NULL, {0}, {0} } 286 }; 287 288 #define is_wroff_extra is_propinfo_tbl[0].prop_cur_uval 289 #define is_ipv4_ttl is_propinfo_tbl[1].prop_cur_uval 290 #define is_ipv6_hoplimit is_propinfo_tbl[2].prop_cur_uval 291 #define is_bsd_compat is_propinfo_tbl[3].prop_cur_bval 292 #define is_xmit_hiwat is_propinfo_tbl[4].prop_cur_uval 293 #define is_xmit_lowat is_propinfo_tbl[5].prop_cur_uval 294 #define is_recv_hiwat is_propinfo_tbl[6].prop_cur_uval 295 #define is_max_buf is_propinfo_tbl[7].prop_cur_uval 296 #define is_pmtu_discovery is_propinfo_tbl[8].prop_cur_bval 297 #define is_sendto_ignerr is_propinfo_tbl[9].prop_cur_bval 298 299 typedef union T_primitives *t_primp_t; 300 301 /* 302 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 303 * passed to icmp_wput. 304 * It calls IP to verify the local IP address, and calls IP to insert 305 * the conn_t in the fanout table. 306 * If everything is ok it then sends the T_BIND_ACK back up. 307 */ 308 static void 309 icmp_tpi_bind(queue_t *q, mblk_t *mp) 310 { 311 int error; 312 struct sockaddr *sa; 313 struct T_bind_req *tbr; 314 socklen_t len; 315 sin_t *sin; 316 sin6_t *sin6; 317 icmp_t *icmp; 318 conn_t *connp = Q_TO_CONN(q); 319 mblk_t *mp1; 320 cred_t *cr; 321 322 /* 323 * All Solaris components should pass a db_credp 324 * for this TPI message, hence we ASSERT. 325 * But in case there is some other M_PROTO that looks 326 * like a TPI message sent by some other kernel 327 * component, we check and return an error. 
328 */ 329 cr = msg_getcred(mp, NULL); 330 ASSERT(cr != NULL); 331 if (cr == NULL) { 332 icmp_err_ack(q, mp, TSYSERR, EINVAL); 333 return; 334 } 335 336 icmp = connp->conn_icmp; 337 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 338 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 339 "icmp_bind: bad req, len %u", 340 (uint_t)(mp->b_wptr - mp->b_rptr)); 341 icmp_err_ack(q, mp, TPROTO, 0); 342 return; 343 } 344 345 if (icmp->icmp_state != TS_UNBND) { 346 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 347 "icmp_bind: bad state, %u", icmp->icmp_state); 348 icmp_err_ack(q, mp, TOUTSTATE, 0); 349 return; 350 } 351 352 /* 353 * Reallocate the message to make sure we have enough room for an 354 * address. 355 */ 356 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 357 if (mp1 == NULL) { 358 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 359 return; 360 } 361 mp = mp1; 362 363 /* Reset the message type in preparation for shipping it back. */ 364 DB_TYPE(mp) = M_PCPROTO; 365 tbr = (struct T_bind_req *)mp->b_rptr; 366 len = tbr->ADDR_length; 367 switch (len) { 368 case 0: /* request for a generic port */ 369 tbr->ADDR_offset = sizeof (struct T_bind_req); 370 if (connp->conn_family == AF_INET) { 371 tbr->ADDR_length = sizeof (sin_t); 372 sin = (sin_t *)&tbr[1]; 373 *sin = sin_null; 374 sin->sin_family = AF_INET; 375 mp->b_wptr = (uchar_t *)&sin[1]; 376 sa = (struct sockaddr *)sin; 377 len = sizeof (sin_t); 378 } else { 379 ASSERT(connp->conn_family == AF_INET6); 380 tbr->ADDR_length = sizeof (sin6_t); 381 sin6 = (sin6_t *)&tbr[1]; 382 *sin6 = sin6_null; 383 sin6->sin6_family = AF_INET6; 384 mp->b_wptr = (uchar_t *)&sin6[1]; 385 sa = (struct sockaddr *)sin6; 386 len = sizeof (sin6_t); 387 } 388 break; 389 390 case sizeof (sin_t): /* Complete IPv4 address */ 391 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 392 sizeof (sin_t)); 393 break; 394 395 case sizeof (sin6_t): /* Complete IPv6 address */ 396 sa = (struct sockaddr *)mi_offset_param(mp, 397 
tbr->ADDR_offset, sizeof (sin6_t)); 398 break; 399 400 default: 401 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 402 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 403 icmp_err_ack(q, mp, TBADADDR, 0); 404 return; 405 } 406 407 error = rawip_do_bind(connp, sa, len); 408 if (error != 0) { 409 if (error > 0) { 410 icmp_err_ack(q, mp, TSYSERR, error); 411 } else { 412 icmp_err_ack(q, mp, -error, 0); 413 } 414 } else { 415 tbr->PRIM_type = T_BIND_ACK; 416 qreply(q, mp); 417 } 418 } 419 420 static int 421 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 422 { 423 sin_t *sin; 424 sin6_t *sin6; 425 icmp_t *icmp = connp->conn_icmp; 426 int error = 0; 427 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 428 in_port_t lport; /* Network byte order */ 429 ipaddr_t v4src; /* Set if AF_INET */ 430 in6_addr_t v6src; 431 uint_t scopeid = 0; 432 zoneid_t zoneid = IPCL_ZONEID(connp); 433 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 434 435 if (sa == NULL || !OK_32PTR((char *)sa)) { 436 return (EINVAL); 437 } 438 439 switch (len) { 440 case sizeof (sin_t): /* Complete IPv4 address */ 441 sin = (sin_t *)sa; 442 if (sin->sin_family != AF_INET || 443 connp->conn_family != AF_INET) { 444 /* TSYSERR, EAFNOSUPPORT */ 445 return (EAFNOSUPPORT); 446 } 447 v4src = sin->sin_addr.s_addr; 448 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 449 if (v4src != INADDR_ANY) { 450 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 451 B_TRUE); 452 } 453 lport = sin->sin_port; 454 break; 455 case sizeof (sin6_t): /* Complete IPv6 address */ 456 sin6 = (sin6_t *)sa; 457 if (sin6->sin6_family != AF_INET6 || 458 connp->conn_family != AF_INET6) { 459 /* TSYSERR, EAFNOSUPPORT */ 460 return (EAFNOSUPPORT); 461 } 462 /* No support for mapped addresses on raw sockets */ 463 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 464 /* TSYSERR, EADDRNOTAVAIL */ 465 return (EADDRNOTAVAIL); 466 } 467 v6src = sin6->sin6_addr; 468 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 469 if 
(IN6_IS_ADDR_LINKSCOPE(&v6src)) 470 scopeid = sin6->sin6_scope_id; 471 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 472 B_TRUE, scopeid); 473 } 474 lport = sin6->sin6_port; 475 break; 476 477 default: 478 /* TBADADDR */ 479 return (EADDRNOTAVAIL); 480 } 481 482 /* Is the local address a valid unicast, multicast, or broadcast? */ 483 if (laddr_type == IPVL_BAD) 484 return (EADDRNOTAVAIL); 485 486 /* 487 * The state must be TS_UNBND. 488 */ 489 mutex_enter(&connp->conn_lock); 490 if (icmp->icmp_state != TS_UNBND) { 491 mutex_exit(&connp->conn_lock); 492 return (-TOUTSTATE); 493 } 494 495 /* 496 * Copy the source address into our icmp structure. This address 497 * may still be zero; if so, ip will fill in the correct address 498 * each time an outbound packet is passed to it. 499 * If we are binding to a broadcast or multicast address then 500 * we just set the conn_bound_addr since we don't want to use 501 * that as the source address when sending. 502 */ 503 connp->conn_bound_addr_v6 = v6src; 504 connp->conn_laddr_v6 = v6src; 505 if (scopeid != 0) { 506 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 507 connp->conn_ixa->ixa_scopeid = scopeid; 508 connp->conn_incoming_ifindex = scopeid; 509 } else { 510 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 511 connp->conn_incoming_ifindex = connp->conn_bound_if; 512 } 513 514 switch (laddr_type) { 515 case IPVL_UNICAST_UP: 516 case IPVL_UNICAST_DOWN: 517 connp->conn_saddr_v6 = v6src; 518 connp->conn_mcbc_bind = B_FALSE; 519 break; 520 case IPVL_MCAST: 521 case IPVL_BCAST: 522 /* ip_set_destination will pick a source address later */ 523 connp->conn_saddr_v6 = ipv6_all_zeros; 524 connp->conn_mcbc_bind = B_TRUE; 525 break; 526 } 527 528 /* Any errors after this point should use late_error */ 529 530 /* 531 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 532 * with IPPROTO_TCP. 
533 */ 534 connp->conn_lport = lport; 535 connp->conn_fport = 0; 536 537 if (connp->conn_family == AF_INET) { 538 ASSERT(connp->conn_ipversion == IPV4_VERSION); 539 } else { 540 ASSERT(connp->conn_ipversion == IPV6_VERSION); 541 } 542 543 icmp->icmp_state = TS_IDLE; 544 545 /* 546 * We create an initial header template here to make a subsequent 547 * sendto have a starting point. Since conn_last_dst is zero the 548 * first sendto will always follow the 'dst changed' code path. 549 * Note that we defer massaging options and the related checksum 550 * adjustment until we have a destination address. 551 */ 552 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 553 &connp->conn_faddr_v6, connp->conn_flowinfo); 554 if (error != 0) { 555 mutex_exit(&connp->conn_lock); 556 goto late_error; 557 } 558 /* Just in case */ 559 connp->conn_faddr_v6 = ipv6_all_zeros; 560 connp->conn_v6lastdst = ipv6_all_zeros; 561 mutex_exit(&connp->conn_lock); 562 563 error = ip_laddr_fanout_insert(connp); 564 if (error != 0) 565 goto late_error; 566 567 /* Bind succeeded */ 568 return (0); 569 570 late_error: 571 mutex_enter(&connp->conn_lock); 572 connp->conn_saddr_v6 = ipv6_all_zeros; 573 connp->conn_bound_addr_v6 = ipv6_all_zeros; 574 connp->conn_laddr_v6 = ipv6_all_zeros; 575 if (scopeid != 0) { 576 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 577 connp->conn_incoming_ifindex = connp->conn_bound_if; 578 } 579 icmp->icmp_state = TS_UNBND; 580 connp->conn_v6lastdst = ipv6_all_zeros; 581 connp->conn_lport = 0; 582 583 /* Restore the header that was built above - different source address */ 584 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 585 &connp->conn_faddr_v6, connp->conn_flowinfo); 586 mutex_exit(&connp->conn_lock); 587 return (error); 588 } 589 590 /* 591 * Tell IP to just bind to the protocol. 
592 */ 593 static void 594 icmp_bind_proto(icmp_t *icmp) 595 { 596 conn_t *connp = icmp->icmp_connp; 597 598 mutex_enter(&connp->conn_lock); 599 connp->conn_saddr_v6 = ipv6_all_zeros; 600 connp->conn_laddr_v6 = ipv6_all_zeros; 601 connp->conn_faddr_v6 = ipv6_all_zeros; 602 connp->conn_v6lastdst = ipv6_all_zeros; 603 mutex_exit(&connp->conn_lock); 604 605 (void) ip_laddr_fanout_insert(connp); 606 } 607 608 /* 609 * This routine handles each T_CONN_REQ message passed to icmp. It 610 * associates a default destination address with the stream. 611 * 612 * After various error checks are completed, icmp_connect() lays 613 * the target address and port into the composite header template. 614 * Then we ask IP for information, including a source address if we didn't 615 * already have one. Finally we send up the T_OK_ACK reply message. 616 */ 617 static void 618 icmp_tpi_connect(queue_t *q, mblk_t *mp) 619 { 620 conn_t *connp = Q_TO_CONN(q); 621 struct T_conn_req *tcr; 622 struct sockaddr *sa; 623 socklen_t len; 624 int error; 625 cred_t *cr; 626 pid_t pid; 627 /* 628 * All Solaris components should pass a db_credp 629 * for this TPI message, hence we ASSERT. 630 * But in case there is some other M_PROTO that looks 631 * like a TPI message sent by some other kernel 632 * component, we check and return an error. 
633 */ 634 cr = msg_getcred(mp, &pid); 635 ASSERT(cr != NULL); 636 if (cr == NULL) { 637 icmp_err_ack(q, mp, TSYSERR, EINVAL); 638 return; 639 } 640 641 tcr = (struct T_conn_req *)mp->b_rptr; 642 /* Sanity checks */ 643 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 644 icmp_err_ack(q, mp, TPROTO, 0); 645 return; 646 } 647 648 if (tcr->OPT_length != 0) { 649 icmp_err_ack(q, mp, TBADOPT, 0); 650 return; 651 } 652 653 len = tcr->DEST_length; 654 655 switch (len) { 656 default: 657 icmp_err_ack(q, mp, TBADADDR, 0); 658 return; 659 case sizeof (sin_t): 660 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 661 sizeof (sin_t)); 662 break; 663 case sizeof (sin6_t): 664 sa = (struct sockaddr *)mi_offset_param(mp, 665 tcr->DEST_offset, sizeof (sin6_t)); 666 break; 667 } 668 669 error = proto_verify_ip_addr(connp->conn_family, sa, len); 670 if (error != 0) { 671 icmp_err_ack(q, mp, TSYSERR, error); 672 return; 673 } 674 675 error = rawip_do_connect(connp, sa, len, cr, pid); 676 if (error != 0) { 677 if (error < 0) { 678 icmp_err_ack(q, mp, -error, 0); 679 } else { 680 icmp_err_ack(q, mp, 0, error); 681 } 682 } else { 683 mblk_t *mp1; 684 685 /* 686 * We have to send a connection confirmation to 687 * keep TLI happy. 688 */ 689 if (connp->conn_family == AF_INET) { 690 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 691 sizeof (sin_t), NULL, 0); 692 } else { 693 ASSERT(connp->conn_family == AF_INET6); 694 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 695 sizeof (sin6_t), NULL, 0); 696 } 697 if (mp1 == NULL) { 698 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 699 return; 700 } 701 702 /* 703 * Send ok_ack for T_CONN_REQ 704 */ 705 mp = mi_tpi_ok_ack_alloc(mp); 706 if (mp == NULL) { 707 /* Unable to reuse the T_CONN_REQ for the ack. 
*/ 708 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 709 return; 710 } 711 putnext(connp->conn_rq, mp); 712 putnext(connp->conn_rq, mp1); 713 } 714 } 715 716 static int 717 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 718 cred_t *cr, pid_t pid) 719 { 720 icmp_t *icmp; 721 sin_t *sin; 722 sin6_t *sin6; 723 int error; 724 uint16_t dstport; 725 ipaddr_t v4dst; 726 in6_addr_t v6dst; 727 uint32_t flowinfo; 728 ip_xmit_attr_t *ixa; 729 ip_xmit_attr_t *oldixa; 730 uint_t scopeid = 0; 731 uint_t srcid = 0; 732 in6_addr_t v6src = connp->conn_saddr_v6; 733 734 icmp = connp->conn_icmp; 735 736 if (sa == NULL || !OK_32PTR((char *)sa)) { 737 return (EINVAL); 738 } 739 740 ASSERT(sa != NULL && len != 0); 741 742 /* 743 * Determine packet type based on type of address passed in 744 * the request should contain an IPv4 or IPv6 address. 745 * Make sure that address family matches the type of 746 * family of the address passed down. 747 */ 748 switch (len) { 749 case sizeof (sin_t): 750 sin = (sin_t *)sa; 751 752 v4dst = sin->sin_addr.s_addr; 753 dstport = sin->sin_port; 754 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 755 ASSERT(connp->conn_ipversion == IPV4_VERSION); 756 break; 757 758 case sizeof (sin6_t): 759 sin6 = (sin6_t *)sa; 760 761 /* No support for mapped addresses on raw sockets */ 762 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 763 return (EADDRNOTAVAIL); 764 } 765 v6dst = sin6->sin6_addr; 766 dstport = sin6->sin6_port; 767 ASSERT(connp->conn_ipversion == IPV6_VERSION); 768 flowinfo = sin6->sin6_flowinfo; 769 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 770 scopeid = sin6->sin6_scope_id; 771 srcid = sin6->__sin6_src_id; 772 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 773 /* Due to check above, we know sin6_addr is v6-only. */ 774 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 775 B_FALSE, connp->conn_netstack)) { 776 /* Mismatch - v6src would be v4mapped. 
*/ 777 return (EADDRNOTAVAIL); 778 } 779 } 780 break; 781 } 782 783 /* 784 * If there is a different thread using conn_ixa then we get a new 785 * copy and cut the old one loose from conn_ixa. Otherwise we use 786 * conn_ixa and prevent any other thread from using/changing it. 787 * Once connect() is done other threads can use conn_ixa since the 788 * refcnt will be back at one. 789 * We defer updating conn_ixa until later to handle any concurrent 790 * conn_ixa_cleanup thread. 791 */ 792 ixa = conn_get_ixa(connp, B_FALSE); 793 if (ixa == NULL) 794 return (ENOMEM); 795 796 mutex_enter(&connp->conn_lock); 797 /* 798 * This icmp_t must have bound already before doing a connect. 799 * Reject if a connect is in progress (we drop conn_lock during 800 * rawip_do_connect). 801 */ 802 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 803 mutex_exit(&connp->conn_lock); 804 ixa_refrele(ixa); 805 return (-TOUTSTATE); 806 } 807 808 if (icmp->icmp_state == TS_DATA_XFER) { 809 /* Already connected - clear out state */ 810 if (connp->conn_mcbc_bind) 811 connp->conn_saddr_v6 = ipv6_all_zeros; 812 else 813 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 814 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 815 connp->conn_faddr_v6 = ipv6_all_zeros; 816 icmp->icmp_state = TS_IDLE; 817 } 818 819 /* 820 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 821 * with IPPROTO_TCP. 822 */ 823 connp->conn_fport = dstport; 824 if (connp->conn_ipversion == IPV4_VERSION) { 825 /* 826 * Interpret a zero destination to mean loopback. 827 * Update the T_CONN_REQ (sin/sin6) since it is used to 828 * generate the T_CONN_CON. 
829 */ 830 if (v4dst == INADDR_ANY) { 831 v4dst = htonl(INADDR_LOOPBACK); 832 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 833 ASSERT(connp->conn_family == AF_INET); 834 sin->sin_addr.s_addr = v4dst; 835 } 836 connp->conn_faddr_v6 = v6dst; 837 connp->conn_flowinfo = 0; 838 } else { 839 ASSERT(connp->conn_ipversion == IPV6_VERSION); 840 /* 841 * Interpret a zero destination to mean loopback. 842 * Update the T_CONN_REQ (sin/sin6) since it is used to 843 * generate the T_CONN_CON. 844 */ 845 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 846 v6dst = ipv6_loopback; 847 sin6->sin6_addr = v6dst; 848 } 849 connp->conn_faddr_v6 = v6dst; 850 connp->conn_flowinfo = flowinfo; 851 } 852 853 /* 854 * We update our cred/cpid based on the caller of connect 855 */ 856 if (connp->conn_cred != cr) { 857 crhold(cr); 858 crfree(connp->conn_cred); 859 connp->conn_cred = cr; 860 } 861 connp->conn_cpid = pid; 862 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 863 ixa->ixa_cred = cr; 864 ixa->ixa_cpid = pid; 865 if (is_system_labeled()) { 866 /* We need to restart with a label based on the cred */ 867 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 868 } 869 870 if (scopeid != 0) { 871 ixa->ixa_flags |= IXAF_SCOPEID_SET; 872 ixa->ixa_scopeid = scopeid; 873 connp->conn_incoming_ifindex = scopeid; 874 } else { 875 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 876 connp->conn_incoming_ifindex = connp->conn_bound_if; 877 } 878 879 /* 880 * conn_connect will drop conn_lock and reacquire it. 881 * To prevent a send* from messing with this icmp_t while the lock 882 * is dropped we set icmp_state and clear conn_v6lastdst. 883 * That will make all send* fail with EISCONN. 884 */ 885 connp->conn_v6lastdst = ipv6_all_zeros; 886 icmp->icmp_state = TS_WCON_CREQ; 887 888 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 889 mutex_exit(&connp->conn_lock); 890 if (error != 0) 891 goto connect_failed; 892 893 /* 894 * The addresses have been verified. Time to insert in 895 * the correct fanout list. 
896 */ 897 error = ipcl_conn_insert(connp); 898 if (error != 0) 899 goto connect_failed; 900 901 mutex_enter(&connp->conn_lock); 902 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 903 &connp->conn_faddr_v6, connp->conn_flowinfo); 904 if (error != 0) { 905 mutex_exit(&connp->conn_lock); 906 goto connect_failed; 907 } 908 909 icmp->icmp_state = TS_DATA_XFER; 910 /* Record this as the "last" send even though we haven't sent any */ 911 connp->conn_v6lastdst = connp->conn_faddr_v6; 912 connp->conn_lastipversion = connp->conn_ipversion; 913 connp->conn_lastdstport = connp->conn_fport; 914 connp->conn_lastflowinfo = connp->conn_flowinfo; 915 connp->conn_lastscopeid = scopeid; 916 connp->conn_lastsrcid = srcid; 917 /* Also remember a source to use together with lastdst */ 918 connp->conn_v6lastsrc = v6src; 919 920 oldixa = conn_replace_ixa(connp, ixa); 921 mutex_exit(&connp->conn_lock); 922 ixa_refrele(oldixa); 923 924 ixa_refrele(ixa); 925 return (0); 926 927 connect_failed: 928 if (ixa != NULL) 929 ixa_refrele(ixa); 930 mutex_enter(&connp->conn_lock); 931 icmp->icmp_state = TS_IDLE; 932 /* In case the source address was set above */ 933 if (connp->conn_mcbc_bind) 934 connp->conn_saddr_v6 = ipv6_all_zeros; 935 else 936 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 937 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 938 connp->conn_faddr_v6 = ipv6_all_zeros; 939 connp->conn_v6lastdst = ipv6_all_zeros; 940 connp->conn_flowinfo = 0; 941 942 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 943 &connp->conn_faddr_v6, connp->conn_flowinfo); 944 mutex_exit(&connp->conn_lock); 945 return (error); 946 } 947 948 static void 949 rawip_do_close(conn_t *connp) 950 { 951 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 952 953 ip_quiesce_conn(connp); 954 955 if (!IPCL_IS_NONSTR(connp)) { 956 qprocsoff(connp->conn_rq); 957 } 958 959 icmp_close_free(connp); 960 961 /* 962 * Now we are truly single threaded on this stream, and can 963 * delete the things 
hanging off the connp, and finally the connp. 964 * We removed this connp from the fanout list, it cannot be 965 * accessed thru the fanouts, and we already waited for the 966 * conn_ref to drop to 0. We are already in close, so 967 * there cannot be any other thread from the top. qprocsoff 968 * has completed, and service has completed or won't run in 969 * future. 970 */ 971 ASSERT(connp->conn_ref == 1); 972 973 if (!IPCL_IS_NONSTR(connp)) { 974 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 975 } else { 976 ip_free_helper_stream(connp); 977 } 978 979 connp->conn_ref--; 980 ipcl_conn_destroy(connp); 981 } 982 983 /* ARGSUSED */ 984 static int 985 icmp_close(queue_t *q, int flags, cred_t *credp __unused) 986 { 987 conn_t *connp; 988 989 if (flags & SO_FALLBACK) { 990 /* 991 * stream is being closed while in fallback 992 * simply free the resources that were allocated 993 */ 994 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 995 qprocsoff(q); 996 goto done; 997 } 998 999 connp = Q_TO_CONN(q); 1000 (void) rawip_do_close(connp); 1001 done: 1002 q->q_ptr = WR(q)->q_ptr = NULL; 1003 return (0); 1004 } 1005 1006 static void 1007 icmp_close_free(conn_t *connp) 1008 { 1009 icmp_t *icmp = connp->conn_icmp; 1010 1011 if (icmp->icmp_filter != NULL) { 1012 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 1013 icmp->icmp_filter = NULL; 1014 } 1015 1016 /* 1017 * Clear any fields which the kmem_cache constructor clears. 1018 * Only icmp_connp needs to be preserved. 1019 * TBD: We should make this more efficient to avoid clearing 1020 * everything. 1021 */ 1022 ASSERT(icmp->icmp_connp == connp); 1023 bzero(icmp, sizeof (icmp_t)); 1024 icmp->icmp_connp = connp; 1025 } 1026 1027 /* 1028 * This routine handles each T_DISCON_REQ message passed to icmp 1029 * as an indicating that ICMP is no longer connected. This results 1030 * in telling IP to restore the binding to just the local address. 
1031 */ 1032 static int 1033 icmp_do_disconnect(conn_t *connp) 1034 { 1035 icmp_t *icmp = connp->conn_icmp; 1036 int error; 1037 1038 mutex_enter(&connp->conn_lock); 1039 if (icmp->icmp_state != TS_DATA_XFER) { 1040 mutex_exit(&connp->conn_lock); 1041 return (-TOUTSTATE); 1042 } 1043 if (connp->conn_mcbc_bind) 1044 connp->conn_saddr_v6 = ipv6_all_zeros; 1045 else 1046 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 1047 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 1048 connp->conn_faddr_v6 = ipv6_all_zeros; 1049 icmp->icmp_state = TS_IDLE; 1050 1051 connp->conn_v6lastdst = ipv6_all_zeros; 1052 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 1053 &connp->conn_faddr_v6, connp->conn_flowinfo); 1054 mutex_exit(&connp->conn_lock); 1055 if (error != 0) 1056 return (error); 1057 1058 /* 1059 * Tell IP to remove the full binding and revert 1060 * to the local address binding. 1061 */ 1062 return (ip_laddr_fanout_insert(connp)); 1063 } 1064 1065 static void 1066 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1067 { 1068 conn_t *connp = Q_TO_CONN(q); 1069 int error; 1070 1071 /* 1072 * Allocate the largest primitive we need to send back 1073 * T_error_ack is > than T_ok_ack 1074 */ 1075 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1076 if (mp == NULL) { 1077 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1078 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1079 return; 1080 } 1081 1082 error = icmp_do_disconnect(connp); 1083 1084 if (error != 0) { 1085 if (error > 0) { 1086 icmp_err_ack(q, mp, 0, error); 1087 } else { 1088 icmp_err_ack(q, mp, -error, 0); 1089 } 1090 } else { 1091 mp = mi_tpi_ok_ack_alloc(mp); 1092 ASSERT(mp != NULL); 1093 qreply(q, mp); 1094 } 1095 } 1096 1097 static int 1098 icmp_disconnect(conn_t *connp) 1099 { 1100 int error; 1101 1102 connp->conn_dgram_errind = B_FALSE; 1103 1104 error = icmp_do_disconnect(connp); 1105 1106 if (error < 0) 1107 error = proto_tlitosyserr(-error); 1108 return (error); 1109 } 1110 1111 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1112 static void 1113 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1114 { 1115 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1116 qreply(q, mp); 1117 } 1118 1119 /* Shorthand to generate and send TPI error acks to our client */ 1120 static void 1121 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1122 t_scalar_t t_error, int sys_error) 1123 { 1124 struct T_error_ack *teackp; 1125 1126 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1127 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1128 teackp = (struct T_error_ack *)mp->b_rptr; 1129 teackp->ERROR_prim = primitive; 1130 teackp->TLI_error = t_error; 1131 teackp->UNIX_error = sys_error; 1132 qreply(q, mp); 1133 } 1134 } 1135 1136 /* 1137 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1138 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1139 * Assumes that IP has pulled up everything up to and including the ICMP header. 
1140 */ 1141 /* ARGSUSED2 */ 1142 static void 1143 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1144 { 1145 conn_t *connp = (conn_t *)arg1; 1146 icmp_t *icmp = connp->conn_icmp; 1147 icmph_t *icmph; 1148 ipha_t *ipha; 1149 int iph_hdr_length; 1150 sin_t sin; 1151 mblk_t *mp1; 1152 int error = 0; 1153 1154 ipha = (ipha_t *)mp->b_rptr; 1155 1156 ASSERT(OK_32PTR(mp->b_rptr)); 1157 1158 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1159 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1160 icmp_icmp_error_ipv6(connp, mp, ira); 1161 return; 1162 } 1163 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1164 1165 /* Skip past the outer IP and ICMP headers */ 1166 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1167 iph_hdr_length = ira->ira_ip_hdr_length; 1168 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1169 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1170 1171 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1172 1173 switch (icmph->icmph_type) { 1174 case ICMP_DEST_UNREACHABLE: 1175 switch (icmph->icmph_code) { 1176 case ICMP_FRAGMENTATION_NEEDED: { 1177 ipha_t *ipha; 1178 ip_xmit_attr_t *ixa; 1179 /* 1180 * IP has already adjusted the path MTU. 1181 * But we need to adjust DF for IPv4. 1182 */ 1183 if (connp->conn_ipversion != IPV4_VERSION) 1184 break; 1185 1186 ixa = conn_get_ixa(connp, B_FALSE); 1187 if (ixa == NULL || ixa->ixa_ire == NULL) { 1188 /* 1189 * Some other thread holds conn_ixa. We will 1190 * redo this on the next ICMP too big. 
1191 */ 1192 if (ixa != NULL) 1193 ixa_refrele(ixa); 1194 break; 1195 } 1196 (void) ip_get_pmtu(ixa); 1197 1198 mutex_enter(&connp->conn_lock); 1199 ipha = (ipha_t *)connp->conn_ht_iphc; 1200 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1201 ipha->ipha_fragment_offset_and_flags |= 1202 IPH_DF_HTONS; 1203 } else { 1204 ipha->ipha_fragment_offset_and_flags &= 1205 ~IPH_DF_HTONS; 1206 } 1207 mutex_exit(&connp->conn_lock); 1208 ixa_refrele(ixa); 1209 break; 1210 } 1211 case ICMP_PORT_UNREACHABLE: 1212 case ICMP_PROTOCOL_UNREACHABLE: 1213 error = ECONNREFUSED; 1214 break; 1215 default: 1216 /* Transient errors */ 1217 break; 1218 } 1219 break; 1220 default: 1221 /* Transient errors */ 1222 break; 1223 } 1224 if (error == 0) { 1225 freemsg(mp); 1226 return; 1227 } 1228 1229 /* 1230 * Deliver T_UDERROR_IND when the application has asked for it. 1231 * The socket layer enables this automatically when connected. 1232 */ 1233 if (!connp->conn_dgram_errind) { 1234 freemsg(mp); 1235 return; 1236 } 1237 1238 sin = sin_null; 1239 sin.sin_family = AF_INET; 1240 sin.sin_addr.s_addr = ipha->ipha_dst; 1241 1242 if (IPCL_IS_NONSTR(connp)) { 1243 mutex_enter(&connp->conn_lock); 1244 if (icmp->icmp_state == TS_DATA_XFER) { 1245 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1246 mutex_exit(&connp->conn_lock); 1247 (*connp->conn_upcalls->su_set_error) 1248 (connp->conn_upper_handle, error); 1249 goto done; 1250 } 1251 } else { 1252 icmp->icmp_delayed_error = error; 1253 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1254 } 1255 mutex_exit(&connp->conn_lock); 1256 } else { 1257 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1258 error); 1259 if (mp1 != NULL) 1260 putnext(connp->conn_rq, mp1); 1261 } 1262 done: 1263 freemsg(mp); 1264 } 1265 1266 /* 1267 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1268 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 
 * Assumes that IP has pulled up all the extension headers as well as the
 * ICMPv6 header.
 *
 * The message mp is always consumed: it is freed on every path, including
 * the ICMP6_PACKET_TOO_BIG path that sends a separate IPV6_PATHMTU
 * indication via icmp_ulp_recv().
 */
static void
icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h, *outer_ip6h;
	uint16_t	iph_hdr_length;
	uint8_t		*nexthdrp;
	sin6_t		sin6;
	mblk_t		*mp1;
	int		error = 0;
	icmp_t		*icmp = connp->conn_icmp;

	outer_ip6h = (ip6_t *)mp->b_rptr;
#ifdef DEBUG
	/* Cross-check the header length recorded in ira. */
	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
	else
		iph_hdr_length = IPV6_HDR_LEN;
	ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
#endif
	/* Skip past the outer IP and ICMP headers */
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];

	ip6h = (ip6_t *)&icmp6[1];	/* Inner IP header */
	/* Drop the message if the inner header chain cannot be walked. */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
		freemsg(mp);
		return;
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_DST_UNREACH:
		switch (icmp6->icmp6_code) {
		case ICMP6_DST_UNREACH_NOPORT:
			error = ECONNREFUSED;
			break;
		case ICMP6_DST_UNREACH_ADMIN:
		case ICMP6_DST_UNREACH_NOROUTE:
		case ICMP6_DST_UNREACH_BEYONDSCOPE:
		case ICMP6_DST_UNREACH_ADDR:
			/* Transient errors */
			break;
		default:
			break;
		}
		break;
	case ICMP6_PACKET_TOO_BIG: {
		struct T_unitdata_ind	*tudi;
		struct T_opthdr		*toh;
		size_t			udi_size;
		mblk_t			*newmp;
		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
		    sizeof (struct ip6_mtuinfo);
		/* NOTE: this pointer shadows the outer sin6_t sin6. */
		sin6_t			*sin6;
		struct ip6_mtuinfo	*mtuinfo;

		/*
		 * If the application has requested to receive path mtu
		 * information, send up an empty message containing an
		 * IPV6_PATHMTU ancillary data item.
		 */
		if (!connp->conn_ipv6_recvpathmtu)
			break;

		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
		    opt_length;
		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
			break;
		}

		/*
		 * newmp->b_cont is left to NULL on purpose.  This is an
		 * empty message containing only ancillary data.
		 */
		/* Layout: T_unitdata_ind, then sin6_t, then option. */
		newmp->b_datap->db_type = M_PROTO;
		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
		newmp->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin6_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
		tudi->OPT_length = opt_length;

		/* Source address of the indication is the connected peer. */
		sin6 = (sin6_t *)&tudi[1];
		bzero(sin6, sizeof (sin6_t));
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = connp->conn_faddr_v6;

		toh = (struct T_opthdr *)&sin6[1];
		toh->level = IPPROTO_IPV6;
		toh->name = IPV6_PATHMTU;
		toh->len = opt_length;
		toh->status = 0;

		/* Report the inner packet's destination and the new MTU. */
		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
		/*
		 * We've consumed everything we need from the original
		 * message.  Free it, then send our empty message.
		 */
		freemsg(mp);
		icmp_ulp_recv(connp, newmp, msgdsize(newmp));
		return;
	}
	case ICMP6_TIME_EXCEEDED:
		/* Transient errors */
		break;
	case ICMP6_PARAM_PROB:
		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
		    (uchar_t *)nexthdrp) {
			error = ECONNREFUSED;
			break;
		}
		break;
	}
	/* Nothing to report for transient errors. */
	if (error == 0) {
		freemsg(mp);
		return;
	}

	/*
	 * Deliver T_UDERROR_IND when the application has asked for it.
	 * The socket layer enables this automatically when connected.
	 */
	if (!connp->conn_dgram_errind) {
		freemsg(mp);
		return;
	}

	/* The reported address is the destination of the inner packet. */
	sin6 = sin6_null;
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = ip6h->ip6_dst;
	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
	if (IPCL_IS_NONSTR(connp)) {
		mutex_enter(&connp->conn_lock);
		if (icmp->icmp_state == TS_DATA_XFER) {
			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
			    &connp->conn_faddr_v6)) {
				/* Upcall is made with conn_lock dropped. */
				mutex_exit(&connp->conn_lock);
				(*connp->conn_upcalls->su_set_error)
				    (connp->conn_upper_handle, error);
				goto done;
			}
		} else {
			/* Not connected: remember the error and address. */
			icmp->icmp_delayed_error = error;
			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
		}
		mutex_exit(&connp->conn_lock);
	} else {
		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
		    NULL, 0, error);
		if (mp1 != NULL)
			putnext(connp->conn_rq, mp1);
	}
done:
	freemsg(mp);
}

/*
 * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
 * The local address is filled in if endpoint is bound. The remote address
 * is filled in if remote address has been specified ("connected endpoint")
 * (The concept of connected CLTS sockets is alien to published TPI
 * but we support it anyway).
1442 */ 1443 static void 1444 icmp_addr_req(queue_t *q, mblk_t *mp) 1445 { 1446 struct sockaddr *sa; 1447 mblk_t *ackmp; 1448 struct T_addr_ack *taa; 1449 icmp_t *icmp = Q_TO_ICMP(q); 1450 conn_t *connp = icmp->icmp_connp; 1451 uint_t addrlen; 1452 1453 /* Make it large enough for worst case */ 1454 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1455 2 * sizeof (sin6_t), 1); 1456 if (ackmp == NULL) { 1457 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1458 return; 1459 } 1460 taa = (struct T_addr_ack *)ackmp->b_rptr; 1461 1462 bzero(taa, sizeof (struct T_addr_ack)); 1463 ackmp->b_wptr = (uchar_t *)&taa[1]; 1464 1465 taa->PRIM_type = T_ADDR_ACK; 1466 ackmp->b_datap->db_type = M_PCPROTO; 1467 1468 if (connp->conn_family == AF_INET) 1469 addrlen = sizeof (sin_t); 1470 else 1471 addrlen = sizeof (sin6_t); 1472 1473 mutex_enter(&connp->conn_lock); 1474 /* 1475 * Note: Following code assumes 32 bit alignment of basic 1476 * data structures like sin_t and struct T_addr_ack. 1477 */ 1478 if (icmp->icmp_state != TS_UNBND) { 1479 /* 1480 * Fill in local address first 1481 */ 1482 taa->LOCADDR_offset = sizeof (*taa); 1483 taa->LOCADDR_length = addrlen; 1484 sa = (struct sockaddr *)&taa[1]; 1485 (void) conn_getsockname(connp, sa, &addrlen); 1486 ackmp->b_wptr += addrlen; 1487 } 1488 if (icmp->icmp_state == TS_DATA_XFER) { 1489 /* 1490 * connected, fill remote address too 1491 */ 1492 taa->REMADDR_length = addrlen; 1493 /* assumed 32-bit alignment */ 1494 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1495 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1496 (void) conn_getpeername(connp, sa, &addrlen); 1497 ackmp->b_wptr += addrlen; 1498 } 1499 mutex_exit(&connp->conn_lock); 1500 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1501 qreply(q, ackmp); 1502 } 1503 1504 static void 1505 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1506 { 1507 conn_t *connp = icmp->icmp_connp; 1508 1509 *tap = icmp_g_t_info_ack; 1510 1511 if (connp->conn_family 
== AF_INET6) 1512 tap->ADDR_size = sizeof (sin6_t); 1513 else 1514 tap->ADDR_size = sizeof (sin_t); 1515 tap->CURRENT_state = icmp->icmp_state; 1516 tap->OPT_size = icmp_max_optsize; 1517 } 1518 1519 static void 1520 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1521 t_uscalar_t cap_bits1) 1522 { 1523 tcap->CAP_bits1 = 0; 1524 1525 if (cap_bits1 & TC1_INFO) { 1526 icmp_copy_info(&tcap->INFO_ack, icmp); 1527 tcap->CAP_bits1 |= TC1_INFO; 1528 } 1529 } 1530 1531 /* 1532 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1533 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1534 * icmp_g_t_info_ack. The current state of the stream is copied from 1535 * icmp_state. 1536 */ 1537 static void 1538 icmp_capability_req(queue_t *q, mblk_t *mp) 1539 { 1540 icmp_t *icmp = Q_TO_ICMP(q); 1541 t_uscalar_t cap_bits1; 1542 struct T_capability_ack *tcap; 1543 1544 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1545 1546 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1547 mp->b_datap->db_type, T_CAPABILITY_ACK); 1548 if (!mp) 1549 return; 1550 1551 tcap = (struct T_capability_ack *)mp->b_rptr; 1552 1553 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1554 1555 qreply(q, mp); 1556 } 1557 1558 /* 1559 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1560 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1561 * The current state of the stream is copied from icmp_state. 1562 */ 1563 static void 1564 icmp_info_req(queue_t *q, mblk_t *mp) 1565 { 1566 icmp_t *icmp = Q_TO_ICMP(q); 1567 1568 /* Create a T_INFO_ACK message. 
*/ 1569 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1570 T_INFO_ACK); 1571 if (!mp) 1572 return; 1573 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1574 qreply(q, mp); 1575 } 1576 1577 static int 1578 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1579 int family) 1580 { 1581 conn_t *connp; 1582 dev_t conn_dev; 1583 int error; 1584 1585 /* If the stream is already open, return immediately. */ 1586 if (q->q_ptr != NULL) 1587 return (0); 1588 1589 if (sflag == MODOPEN) 1590 return (EINVAL); 1591 1592 /* 1593 * Since ICMP is not used so heavily, allocating from the small 1594 * arena should be sufficient. 1595 */ 1596 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1597 return (EBUSY); 1598 } 1599 1600 if (flag & SO_FALLBACK) { 1601 /* 1602 * Non streams socket needs a stream to fallback to 1603 */ 1604 RD(q)->q_ptr = (void *)conn_dev; 1605 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1606 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1607 qprocson(q); 1608 return (0); 1609 } 1610 1611 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1612 if (connp == NULL) { 1613 ASSERT(error != 0); 1614 inet_minor_free(ip_minor_arena_sa, conn_dev); 1615 return (error); 1616 } 1617 1618 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1619 connp->conn_dev = conn_dev; 1620 connp->conn_minor_arena = ip_minor_arena_sa; 1621 1622 /* 1623 * Initialize the icmp_t structure for this stream. 1624 */ 1625 q->q_ptr = connp; 1626 WR(q)->q_ptr = connp; 1627 connp->conn_rq = q; 1628 connp->conn_wq = WR(q); 1629 1630 WR(q)->q_hiwat = connp->conn_sndbuf; 1631 WR(q)->q_lowat = connp->conn_sndlowat; 1632 1633 qprocson(q); 1634 1635 /* Set the Stream head write offset. 
*/ 1636 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1637 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1638 1639 mutex_enter(&connp->conn_lock); 1640 connp->conn_state_flags &= ~CONN_INCIPIENT; 1641 mutex_exit(&connp->conn_lock); 1642 1643 icmp_bind_proto(connp->conn_icmp); 1644 1645 return (0); 1646 } 1647 1648 /* For /dev/icmp aka AF_INET open */ 1649 static int 1650 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1651 { 1652 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1653 } 1654 1655 /* For /dev/icmp6 aka AF_INET6 open */ 1656 static int 1657 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1658 { 1659 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1660 } 1661 1662 /* 1663 * This is the open routine for icmp. It allocates a icmp_t structure for 1664 * the stream and, on the first open of the module, creates an ND table. 1665 */ 1666 static conn_t * 1667 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1668 { 1669 icmp_t *icmp; 1670 conn_t *connp; 1671 zoneid_t zoneid; 1672 netstack_t *ns; 1673 icmp_stack_t *is; 1674 int len; 1675 boolean_t isv6 = B_FALSE; 1676 1677 *err = secpolicy_net_icmpaccess(credp); 1678 if (*err != 0) 1679 return (NULL); 1680 1681 if (family == AF_INET6) 1682 isv6 = B_TRUE; 1683 1684 ns = netstack_find_by_cred(credp); 1685 ASSERT(ns != NULL); 1686 is = ns->netstack_icmp; 1687 ASSERT(is != NULL); 1688 1689 /* 1690 * For exclusive stacks we set the zoneid to zero 1691 * to make ICMP operate as if in the global zone. 1692 */ 1693 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1694 zoneid = GLOBAL_ZONEID; 1695 else 1696 zoneid = crgetzoneid(credp); 1697 1698 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1699 1700 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1701 icmp = connp->conn_icmp; 1702 1703 /* 1704 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1705 * done by netstack_find_by_cred() 1706 */ 1707 netstack_rele(ns); 1708 1709 /* 1710 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1711 * need to lock anything. 1712 */ 1713 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1714 ASSERT(connp->conn_icmp == icmp); 1715 ASSERT(icmp->icmp_connp == connp); 1716 1717 /* Set the initial state of the stream and the privilege status. */ 1718 icmp->icmp_state = TS_UNBND; 1719 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1720 if (isv6) { 1721 connp->conn_family = AF_INET6; 1722 connp->conn_ipversion = IPV6_VERSION; 1723 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1724 connp->conn_proto = IPPROTO_ICMPV6; 1725 /* May be changed by a SO_PROTOTYPE socket option. */ 1726 connp->conn_proto = IPPROTO_ICMPV6; 1727 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1728 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1729 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1730 len = sizeof (ip6_t); 1731 } else { 1732 connp->conn_family = AF_INET; 1733 connp->conn_ipversion = IPV4_VERSION; 1734 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1735 /* May be changed by a SO_PROTOTYPE socket option. */ 1736 connp->conn_proto = IPPROTO_ICMP; 1737 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1738 connp->conn_default_ttl = is->is_ipv4_ttl; 1739 len = sizeof (ipha_t); 1740 } 1741 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1742 1743 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1744 1745 /* 1746 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1747 * the checksum is provided in the pre-built packet. We clear 1748 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1749 * complete IP header and not to compute the transport checksum. 
1750 */ 1751 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1752 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1753 connp->conn_ixa->ixa_zoneid = zoneid; 1754 1755 connp->conn_zoneid = zoneid; 1756 1757 /* 1758 * If the caller has the process-wide flag set, then default to MAC 1759 * exempt mode. This allows read-down to unlabeled hosts. 1760 */ 1761 if (getpflags(NET_MAC_AWARE, credp) != 0) 1762 connp->conn_mac_mode = CONN_MAC_AWARE; 1763 1764 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1765 1766 icmp->icmp_is = is; 1767 1768 connp->conn_rcvbuf = is->is_recv_hiwat; 1769 connp->conn_sndbuf = is->is_xmit_hiwat; 1770 connp->conn_sndlowat = is->is_xmit_lowat; 1771 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1772 1773 connp->conn_wroff = len + is->is_wroff_extra; 1774 connp->conn_so_type = SOCK_RAW; 1775 1776 connp->conn_recv = icmp_input; 1777 connp->conn_recvicmp = icmp_icmp_input; 1778 crhold(credp); 1779 connp->conn_cred = credp; 1780 connp->conn_cpid = curproc->p_pid; 1781 connp->conn_open_time = ddi_get_lbolt64(); 1782 /* Cache things in ixa without an extra refhold */ 1783 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED)); 1784 connp->conn_ixa->ixa_cred = connp->conn_cred; 1785 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1786 if (is_system_labeled()) 1787 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1788 1789 connp->conn_flow_cntrld = B_FALSE; 1790 1791 if (is->is_pmtu_discovery) 1792 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1793 1794 return (connp); 1795 } 1796 1797 /* 1798 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1799 */ 1800 /* ARGSUSED */ 1801 static boolean_t 1802 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1803 { 1804 return (B_TRUE); 1805 } 1806 1807 /* 1808 * This routine gets default values of certain options whose default 1809 * values are maintained by protcol specific code 1810 */ 1811 int 1812 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1813 { 1814 icmp_t *icmp = Q_TO_ICMP(q); 1815 icmp_stack_t *is = icmp->icmp_is; 1816 int *i1 = (int *)ptr; 1817 1818 switch (level) { 1819 case IPPROTO_IP: 1820 switch (name) { 1821 case IP_MULTICAST_TTL: 1822 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1823 return (sizeof (uchar_t)); 1824 case IP_MULTICAST_LOOP: 1825 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1826 return (sizeof (uchar_t)); 1827 } 1828 break; 1829 case IPPROTO_IPV6: 1830 switch (name) { 1831 case IPV6_MULTICAST_HOPS: 1832 *i1 = IP_DEFAULT_MULTICAST_TTL; 1833 return (sizeof (int)); 1834 case IPV6_MULTICAST_LOOP: 1835 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1836 return (sizeof (int)); 1837 case IPV6_UNICAST_HOPS: 1838 *i1 = is->is_ipv6_hoplimit; 1839 return (sizeof (int)); 1840 } 1841 break; 1842 case IPPROTO_ICMPV6: 1843 switch (name) { 1844 case ICMP6_FILTER: 1845 /* Make it look like "pass all" */ 1846 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1847 return (sizeof (icmp6_filter_t)); 1848 } 1849 break; 1850 } 1851 return (-1); 1852 } 1853 1854 /* 1855 * This routine retrieves the current status of socket options. 1856 * It returns the size of the option retrieved, or -1. 
 *
 * Options specific to raw IP (IP_HDRINCL, IPV6_CHECKSUM, ICMP6_FILTER and
 * the IP_OPTIONS variants) are answered here; everything else is passed to
 * conn_opt_get() with conn_lock held.
 */
int
icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		*i1 = (int *)ptr;
	conn_opt_arg_t	coas;
	int		retval;

	coas.coa_connp = connp;
	coas.coa_ixa = connp->conn_ixa;
	coas.coa_ipp = &connp->conn_xmit_ipp;
	coas.coa_ancillary = B_FALSE;
	coas.coa_changed = 0;

	/*
	 * We assume that the optcom framework has checked for the set
	 * of levels and names that are supported, hence we don't worry
	 * about rejecting based on that.
	 * First check for ICMP specific handling, then pass to common routine.
	 */
	switch (level) {
	case IPPROTO_IP:
		/*
		 * Only allow IPv4 option processing on IPv4 sockets.
		 */
		if (connp->conn_family != AF_INET)
			return (-1);

		switch (name) {
		case IP_OPTIONS:
		case T_IP_OPTIONS:
			/* Options are passed up with each packet */
			return (0);
		case IP_HDRINCL:
			mutex_enter(&connp->conn_lock);
			*i1 = (int)icmp->icmp_hdrincl;
			mutex_exit(&connp->conn_lock);
			return (sizeof (int));
		}
		break;

	case IPPROTO_IPV6:
		/*
		 * Only allow IPv6 option processing on native IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (-1);

		switch (name) {
		case IPV6_CHECKSUM:
			/*
			 * Return offset or -1 if no checksum offset.
			 * Does not apply to IPPROTO_ICMPV6
			 */
			if (connp->conn_proto == IPPROTO_ICMPV6)
				return (-1);

			mutex_enter(&connp->conn_lock);
			if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
				*i1 = connp->conn_ixa->ixa_raw_cksum_offset;
			else
				*i1 = -1;
			mutex_exit(&connp->conn_lock);
			return (sizeof (int));
		}
		break;

	case IPPROTO_ICMPV6:
		/*
		 * Only allow IPv6 option processing on native IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (-1);

		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (-1);

		switch (name) {
		case ICMP6_FILTER:
			mutex_enter(&connp->conn_lock);
			if (icmp->icmp_filter == NULL) {
				/* Make it look like "pass all" */
				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
			} else {
				(void) bcopy(icmp->icmp_filter, ptr,
				    sizeof (icmp6_filter_t));
			}
			mutex_exit(&connp->conn_lock);
			return (sizeof (icmp6_filter_t));
		}
	}
	/* Not handled above: defer to the common conn option code. */
	mutex_enter(&connp->conn_lock);
	retval = conn_opt_get(&coas, level, name, ptr);
	mutex_exit(&connp->conn_lock);
	return (retval);
}

/*
 * This routine retrieves the current status of socket options.
 * It returns the size of the option retrieved, or -1.
 * Queue-based wrapper around icmp_opt_get().
 */
int
icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
	conn_t		*connp = Q_TO_CONN(q);
	int		err;

	err = icmp_opt_get(connp, level, name, ptr);
	return (err);
}

/*
 * This routine sets socket options.
 *
 * Raw IP specific options (SO_PROTOTYPE, IP_HDRINCL, IPV6_CHECKSUM,
 * ICMP6_FILTER) are validated and, unless checkonly is set, applied here
 * under conn_lock. Unless a case returns early, the request is then passed
 * on to conn_opt_set(), whose result is returned.
 *
 * Must be called without conn_lock held (asserted below). Returns 0 or an
 * errno.
 */
int
icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
    uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
{
	conn_t		*connp = coa->coa_connp;
	ip_xmit_attr_t	*ixa = coa->coa_ixa;
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		*i1 = (int *)invalp;
	/* NOTE(review): reads *invalp before inlen is examined. */
	boolean_t	onoff = (*i1 == 0) ? 0 : 1;
	int		error;

	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));

	/*
	 * For fixed length options, no sanity check
	 * of passed in length is done. It is assumed *_optcom_req()
	 * routines do the right thing.
	 */

	switch (level) {
	case SOL_SOCKET:
		switch (name) {
		case SO_PROTOTYPE:
			/* Protocols other than ICMP/ICMPv6 need privilege. */
			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
			    secpolicy_net_rawaccess(cr) != 0) {
				return (EACCES);
			}
			if (checkonly)
				break;

			mutex_enter(&connp->conn_lock);
			connp->conn_proto = *i1 & 0xFF;
			ixa->ixa_protocol = connp->conn_proto;
			if ((connp->conn_proto == IPPROTO_RAW ||
			    connp->conn_proto == IPPROTO_IGMP) &&
			    connp->conn_family == AF_INET) {
				icmp->icmp_hdrincl = 1;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else if (connp->conn_proto == IPPROTO_UDP ||
			    connp->conn_proto == IPPROTO_TCP ||
			    connp->conn_proto == IPPROTO_SCTP) {
				/* Used by test applications like psh */
				icmp->icmp_hdrincl = 0;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else {
				icmp->icmp_hdrincl = 0;
				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
			}

			if (connp->conn_family == AF_INET6 &&
			    connp->conn_proto == IPPROTO_ICMPV6) {
				/* Set offset for icmp6_cksum */
				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = 2;
			}
			/* A filter is only kept while the proto is ICMPv6. */
			if (icmp->icmp_filter != NULL &&
			    connp->conn_proto != IPPROTO_ICMPV6) {
				kmem_free(icmp->icmp_filter,
				    sizeof (icmp6_filter_t));
				icmp->icmp_filter = NULL;
			}
			mutex_exit(&connp->conn_lock);

			coa->coa_changed |= COA_HEADER_CHANGED;
			/*
			 * For SCTP, we don't use icmp_bind_proto() for
			 * raw socket binding.
			 */
			if (connp->conn_proto == IPPROTO_SCTP)
				return (0);

			coa->coa_changed |= COA_ICMP_BIND_NEEDED;
			return (0);

		case SO_SNDBUF:
			if (*i1 > is->is_max_buf) {
				return (ENOBUFS);
			}
			break;
		case SO_RCVBUF:
			if (*i1 > is->is_max_buf) {
				return (ENOBUFS);
			}
			break;
		}
		break;

	case IPPROTO_IP:
		/*
		 * Only allow IPv4 option processing on IPv4 sockets.
		 */
		if (connp->conn_family != AF_INET)
			return (EINVAL);

		switch (name) {
		case IP_HDRINCL:
			if (!checkonly) {
				mutex_enter(&connp->conn_lock);
				icmp->icmp_hdrincl = onoff;
				/* With IP_HDRINCL on, IP must not checksum. */
				if (onoff)
					ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
				else
					ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
				mutex_exit(&connp->conn_lock);
			}
			break;
		}
		break;

	case IPPROTO_IPV6:
		if (connp->conn_family != AF_INET6)
			return (EINVAL);

		switch (name) {
		case IPV6_CHECKSUM:
			/*
			 * Integer offset into the user data of where the
			 * checksum is located.
			 * Offset of -1 disables option.
			 * Does not apply to IPPROTO_ICMPV6.
			 */
			if (connp->conn_proto == IPPROTO_ICMPV6 ||
			    coa->coa_ancillary) {
				return (EINVAL);
			}
			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
				/* Negative or not 16 bit aligned offset */
				return (EINVAL);
			}
			if (checkonly)
				break;

			mutex_enter(&connp->conn_lock);
			if (*i1 == -1) {
				ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = 0;
				ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
			} else {
				ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
				ixa->ixa_raw_cksum_offset = *i1;
				ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;

	case IPPROTO_ICMPV6:
		/*
		 * Only allow IPv6 option processing on IPv6 sockets.
		 */
		if (connp->conn_family != AF_INET6)
			return (EINVAL);
		if (connp->conn_proto != IPPROTO_ICMPV6)
			return (EINVAL);

		switch (name) {
		case ICMP6_FILTER:
			/*
			 * NOTE(review): with checkonly set the length check
			 * below is skipped.
			 */
			if (checkonly)
				break;

			if ((inlen != 0) &&
			    (inlen != sizeof (icmp6_filter_t)))
				return (EINVAL);

			mutex_enter(&connp->conn_lock);
			if (inlen == 0) {
				/* Zero length removes any existing filter. */
				if (icmp->icmp_filter != NULL) {
					kmem_free(icmp->icmp_filter,
					    sizeof (icmp6_filter_t));
					icmp->icmp_filter = NULL;
				}
			} else {
				/* Allocate on first use, then overwrite. */
				if (icmp->icmp_filter == NULL) {
					icmp->icmp_filter = kmem_alloc(
					    sizeof (icmp6_filter_t),
					    KM_NOSLEEP);
					if (icmp->icmp_filter == NULL) {
						mutex_exit(&connp->conn_lock);
						return (ENOBUFS);
					}
				}
				(void) bcopy(invalp, icmp->icmp_filter, inlen);
			}
			mutex_exit(&connp->conn_lock);
			break;
		}
		break;
	}
	/* Hand everything that did not return above to the common code. */
	error = conn_opt_set(coa, level, name, inlen, invalp,
	    checkonly, cr);
	return (error);
}

/*
 * This routine sets socket options.
 */
int
icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	icmp_t		*icmp = connp->conn_icmp;
	int		err;
	conn_opt_arg_t	coas, *coa;
	boolean_t	checkonly;
	icmp_stack_t	*is = icmp->icmp_is;

	switch (optset_context) {
	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 * 	we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 * 	value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
2192 */ 2193 if (inlen == 0) { 2194 *outlenp = 0; 2195 return (0); 2196 } 2197 break; 2198 case SETFN_OPTCOM_NEGOTIATE: 2199 checkonly = B_FALSE; 2200 break; 2201 case SETFN_UD_NEGOTIATE: 2202 case SETFN_CONN_NEGOTIATE: 2203 checkonly = B_FALSE; 2204 /* 2205 * Negotiating local and "association-related" options 2206 * through T_UNITDATA_REQ. 2207 * 2208 * Following routine can filter out ones we do not 2209 * want to be "set" this way. 2210 */ 2211 if (!icmp_opt_allow_udr_set(level, name)) { 2212 *outlenp = 0; 2213 return (EINVAL); 2214 } 2215 break; 2216 default: 2217 /* 2218 * We should never get here 2219 */ 2220 *outlenp = 0; 2221 return (EINVAL); 2222 } 2223 2224 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 2225 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 2226 2227 if (thisdg_attrs != NULL) { 2228 /* Options from T_UNITDATA_REQ */ 2229 coa = (conn_opt_arg_t *)thisdg_attrs; 2230 ASSERT(coa->coa_connp == connp); 2231 ASSERT(coa->coa_ixa != NULL); 2232 ASSERT(coa->coa_ipp != NULL); 2233 ASSERT(coa->coa_ancillary); 2234 } else { 2235 coa = &coas; 2236 coas.coa_connp = connp; 2237 /* Get a reference on conn_ixa to prevent concurrent mods */ 2238 coas.coa_ixa = conn_get_ixa(connp, B_TRUE); 2239 if (coas.coa_ixa == NULL) { 2240 *outlenp = 0; 2241 return (ENOMEM); 2242 } 2243 coas.coa_ipp = &connp->conn_xmit_ipp; 2244 coas.coa_ancillary = B_FALSE; 2245 coas.coa_changed = 0; 2246 } 2247 2248 err = icmp_do_opt_set(coa, level, name, inlen, invalp, 2249 cr, checkonly); 2250 if (err != 0) { 2251 errout: 2252 if (!coa->coa_ancillary) 2253 ixa_refrele(coa->coa_ixa); 2254 *outlenp = 0; 2255 return (err); 2256 } 2257 2258 /* 2259 * Common case of OK return with outval same as inval. 
2260 */ 2261 if (invalp != outvalp) { 2262 /* don't trust bcopy for identical src/dst */ 2263 (void) bcopy(invalp, outvalp, inlen); 2264 } 2265 *outlenp = inlen; 2266 2267 /* 2268 * If this was not ancillary data, then we rebuild the headers, 2269 * update the IRE/NCE, and IPsec as needed. 2270 * Since the label depends on the destination we go through 2271 * ip_set_destination first. 2272 */ 2273 if (coa->coa_ancillary) { 2274 return (0); 2275 } 2276 2277 if (coa->coa_changed & COA_ROUTE_CHANGED) { 2278 in6_addr_t saddr, faddr, nexthop; 2279 in_port_t fport; 2280 2281 /* 2282 * We clear lastdst to make sure we pick up the change 2283 * next time sending. 2284 * If we are connected we re-cache the information. 2285 * We ignore errors to preserve BSD behavior. 2286 * Note that we don't redo IPsec policy lookup here 2287 * since the final destination (or source) didn't change. 2288 */ 2289 mutex_enter(&connp->conn_lock); 2290 connp->conn_v6lastdst = ipv6_all_zeros; 2291 2292 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, 2293 &connp->conn_faddr_v6, &nexthop); 2294 saddr = connp->conn_saddr_v6; 2295 faddr = connp->conn_faddr_v6; 2296 fport = connp->conn_fport; 2297 mutex_exit(&connp->conn_lock); 2298 2299 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && 2300 !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { 2301 (void) ip_attr_connect(connp, coa->coa_ixa, 2302 &saddr, &faddr, &nexthop, fport, NULL, NULL, 2303 IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 2304 } 2305 } 2306 2307 ixa_refrele(coa->coa_ixa); 2308 2309 if (coa->coa_changed & COA_HEADER_CHANGED) { 2310 /* 2311 * Rebuild the header template if we are connected. 2312 * Otherwise clear conn_v6lastdst so we rebuild the header 2313 * in the data path. 
2314 */ 2315 mutex_enter(&connp->conn_lock); 2316 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 2317 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 2318 err = icmp_build_hdr_template(connp, 2319 &connp->conn_saddr_v6, &connp->conn_faddr_v6, 2320 connp->conn_flowinfo); 2321 if (err != 0) { 2322 mutex_exit(&connp->conn_lock); 2323 return (err); 2324 } 2325 } else { 2326 connp->conn_v6lastdst = ipv6_all_zeros; 2327 } 2328 mutex_exit(&connp->conn_lock); 2329 } 2330 if (coa->coa_changed & COA_RCVBUF_CHANGED) { 2331 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 2332 connp->conn_rcvbuf); 2333 } 2334 if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 2335 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 2336 } 2337 if (coa->coa_changed & COA_WROFF_CHANGED) { 2338 /* Increase wroff if needed */ 2339 uint_t wroff; 2340 2341 mutex_enter(&connp->conn_lock); 2342 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; 2343 if (wroff > connp->conn_wroff) { 2344 connp->conn_wroff = wroff; 2345 mutex_exit(&connp->conn_lock); 2346 (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); 2347 } else { 2348 mutex_exit(&connp->conn_lock); 2349 } 2350 } 2351 if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { 2352 icmp_bind_proto(icmp); 2353 } 2354 return (err); 2355 } 2356 2357 /* This routine sets socket options. */ 2358 int 2359 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 2360 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2361 void *thisdg_attrs, cred_t *cr) 2362 { 2363 conn_t *connp = Q_TO_CONN(q); 2364 int error; 2365 2366 error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, 2367 outlenp, outvalp, thisdg_attrs, cr); 2368 return (error); 2369 } 2370 2371 /* 2372 * Setup IP headers. 2373 * 2374 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, 2375 * but icmp_output_hdrincl restores ipha_protocol once we return. 
2376 */ 2377 mblk_t * 2378 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2379 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2380 mblk_t *data_mp, int *errorp) 2381 { 2382 mblk_t *mp; 2383 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2384 uint_t data_len; 2385 uint32_t cksum; 2386 2387 data_len = msgdsize(data_mp); 2388 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2389 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2390 if (mp == NULL) { 2391 ASSERT(*errorp != 0); 2392 return (NULL); 2393 } 2394 2395 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2396 2397 /* 2398 * If there was a routing option/header then conn_prepend_hdr 2399 * has massaged it and placed the pseudo-header checksum difference 2400 * in the cksum argument. 2401 * 2402 * Prepare for ICMPv6 checksum done in IP. 2403 * 2404 * We make it easy for IP to include our pseudo header 2405 * by putting our length (and any routing header adjustment) 2406 * in the ICMPv6 checksum field. 2407 * The IP source, destination, and length have already been set by 2408 * conn_prepend_hdr. 
2409 */ 2410 cksum += data_len; 2411 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2412 ASSERT(cksum < 0x10000); 2413 2414 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2415 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2416 2417 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2418 } else { 2419 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2420 uint_t cksum_offset = 0; 2421 2422 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2423 2424 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2425 if (connp->conn_proto == IPPROTO_ICMPV6) { 2426 cksum_offset = ixa->ixa_ip_hdr_length + 2427 offsetof(icmp6_t, icmp6_cksum); 2428 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2429 cksum_offset = ixa->ixa_ip_hdr_length + 2430 ixa->ixa_raw_cksum_offset; 2431 } 2432 } 2433 if (cksum_offset != 0) { 2434 uint16_t *ptr; 2435 2436 /* Make sure the checksum fits in the first mblk */ 2437 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2438 mblk_t *mp1; 2439 2440 mp1 = msgpullup(mp, 2441 cksum_offset + sizeof (short)); 2442 freemsg(mp); 2443 if (mp1 == NULL) { 2444 *errorp = ENOMEM; 2445 return (NULL); 2446 } 2447 mp = mp1; 2448 ip6h = (ip6_t *)mp->b_rptr; 2449 } 2450 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2451 *ptr = htons(cksum); 2452 } 2453 } 2454 2455 /* Note that we don't try to update wroff due to ancillary data */ 2456 return (mp); 2457 } 2458 2459 static int 2460 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2461 const in6_addr_t *v6dst, uint32_t flowinfo) 2462 { 2463 int error; 2464 2465 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2466 /* 2467 * We clear lastdst to make sure we don't use the lastdst path 2468 * next time sending since we might not have set v6dst yet. 2469 */ 2470 connp->conn_v6lastdst = ipv6_all_zeros; 2471 2472 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2473 if (error != 0) 2474 return (error); 2475 2476 /* 2477 * Any routing header/option has been massaged. The checksum difference 2478 * is stored in conn_sum. 
2479 */ 2480 return (0); 2481 } 2482 2483 static mblk_t * 2484 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2485 { 2486 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2487 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2488 /* 2489 * fallback has started but messages have not been moved yet 2490 */ 2491 if (icmp->icmp_fallback_queue_head == NULL) { 2492 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2493 icmp->icmp_fallback_queue_head = mp; 2494 icmp->icmp_fallback_queue_tail = mp; 2495 } else { 2496 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2497 icmp->icmp_fallback_queue_tail->b_next = mp; 2498 icmp->icmp_fallback_queue_tail = mp; 2499 } 2500 return (NULL); 2501 } else { 2502 /* 2503 * Fallback completed, let the caller putnext() the mblk. 2504 */ 2505 return (mp); 2506 } 2507 } 2508 2509 /* 2510 * Deliver data to ULP. In case we have a socket, and it's falling back to 2511 * TPI, then we'll queue the mp for later processing. 2512 */ 2513 static void 2514 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2515 { 2516 if (IPCL_IS_NONSTR(connp)) { 2517 icmp_t *icmp = connp->conn_icmp; 2518 int error; 2519 2520 ASSERT(len == msgdsize(mp)); 2521 if ((*connp->conn_upcalls->su_recv) 2522 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2523 mutex_enter(&icmp->icmp_recv_lock); 2524 if (error == ENOSPC) { 2525 /* 2526 * let's confirm while holding the lock 2527 */ 2528 if ((*connp->conn_upcalls->su_recv) 2529 (connp->conn_upper_handle, NULL, 0, 0, 2530 &error, NULL) < 0) { 2531 ASSERT(error == ENOSPC); 2532 if (error == ENOSPC) { 2533 connp->conn_flow_cntrld = 2534 B_TRUE; 2535 } 2536 } 2537 mutex_exit(&icmp->icmp_recv_lock); 2538 } else { 2539 ASSERT(error == EOPNOTSUPP); 2540 mp = icmp_queue_fallback(icmp, mp); 2541 mutex_exit(&icmp->icmp_recv_lock); 2542 if (mp != NULL) 2543 putnext(connp->conn_rq, mp); 2544 } 2545 } 2546 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2547 } else { 2548 putnext(connp->conn_rq, mp); 2549 } 2550 } 2551 2552 /* 2553 * This is 
the inbound data path.
 * IP has already pulled up the IP headers and verified alignment
 * etc.
 */
/*
 * arg1 is the receiving conn_t, mp the M_DATA packet starting at the IP
 * header, and ira the receive attributes (packet length, IP header length,
 * address family flags).  The packet is wrapped in a T_UNITDATA_IND that
 * carries the source address plus any requested ancillary data and is
 * handed to icmp_ulp_recv().  IPv4 packets are passed up with their IP
 * header; for IPv6 the IP headers are skipped.  mp is always consumed.
 *
 * Packets are dropped (and freed) on allocation failure, when an ICMPv6
 * filter blocks the type, when the ICMPv6 type byte is not present in the
 * first mblk, or when a requested IPv6 raw checksum does not verify.
 */
/* ARGSUSED2 */
static void
icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t		*connp = (conn_t *)arg1;
	struct T_unitdata_ind *tudi;
	uchar_t		*rptr;		/* Pointer to IP header */
	int		ip_hdr_length;
	int		udi_size;	/* Size of T_unitdata_ind */
	int		pkt_len;
	icmp_t		*icmp;
	ip_pkt_t	ipps;
	ip6_t		*ip6h;
	mblk_t		*mp1;
	crb_t		recv_ancillary;
	icmp_stack_t	*is;
	sin_t		*sin;
	sin6_t		*sin6;
	ipha_t		*ipha;

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);

	icmp = connp->conn_icmp;
	is = icmp->icmp_is;
	rptr = mp->b_rptr;

	ASSERT(DB_TYPE(mp) == M_DATA);
	ASSERT(OK_32PTR(rptr));
	ASSERT(ira->ira_pktlen == msgdsize(mp));
	pkt_len = ira->ira_pktlen;

	/*
	 * Get a snapshot of these and allow other threads to change
	 * them after that. We need the same recv_ancillary when determining
	 * the size as when adding the ancillary data items.
	 */
	mutex_enter(&connp->conn_lock);
	recv_ancillary = connp->conn_recv_ancillary;
	mutex_exit(&connp->conn_lock);

	ip_hdr_length = ira->ira_ip_hdr_length;
	ASSERT(MBLKL(mp) >= ip_hdr_length);	/* IP did a pullup */

	/* Initialize regardless of IP version */
	ipps.ipp_fields = 0;

	if (ira->ira_flags & IRAF_IS_IPV4) {
		ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
		ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));

		ipha = (ipha_t *)mp->b_rptr;
		if (recv_ancillary.crb_all != 0)
			(void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);

		/*
		 * BSD for some reason adjusts ipha_length to exclude the
		 * IP header length. We do the same.
		 */
		if (is->is_bsd_compat) {
			ushort_t len;

			len = ntohs(ipha->ipha_length);
			if (mp->b_datap->db_ref > 1) {
				/*
				 * Allocate a new IP header so that we can
				 * modify ipha_length.
				 */
				mblk_t *hdr_mp;

				hdr_mp = allocb(ip_hdr_length, BPRI_MED);
				if (hdr_mp == NULL) {
					freemsg(mp);
					BUMP_MIB(&is->is_rawip_mib,
					    rawipInErrors);
					return;
				}
				bcopy(rptr, hdr_mp->b_rptr, ip_hdr_length);
				mp->b_rptr = rptr + ip_hdr_length;
				rptr = hdr_mp->b_rptr;
				ipha = (ipha_t *)rptr;
				hdr_mp->b_cont = mp;
				hdr_mp->b_wptr = rptr + ip_hdr_length;
				mp = hdr_mp;
			}
			len -= ip_hdr_length;
			ipha->ipha_length = htons(len);
		}

		/*
		 * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
		 * sockets. This is ensured by icmp_bind and the IP fanout code.
		 */
		ASSERT(connp->conn_family == AF_INET);

		/*
		 * This is the inbound data path. Packets are passed upstream
		 * as T_UNITDATA_IND messages with full IPv4 headers still
		 * attached.
		 */

		/*
		 * Normally only send up the source address.
		 * If any ancillary data items are wanted we add those.
		 */
		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
		if (recv_ancillary.crb_all != 0) {
			udi_size += conn_recvancillary_size(connp,
			    recv_ancillary, ira, mp, &ipps);
		}

		/* Allocate a message block for the T_UNITDATA_IND structure. */
		mp1 = allocb(udi_size, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
			return;
		}
		mp1->b_cont = mp;
		tudi = (struct T_unitdata_ind *)mp1->b_rptr;
		mp1->b_datap->db_type = M_PROTO;
		mp1->b_wptr = (uchar_t *)tudi + udi_size;
		tudi->PRIM_type = T_UNITDATA_IND;
		tudi->SRC_length = sizeof (sin_t);
		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
		sin = (sin_t *)&tudi[1];
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipha->ipha_src;
		*(uint32_t *)&sin->sin_zero[0] = 0;
		*(uint32_t *)&sin->sin_zero[4] = 0;
		tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
		    sizeof (sin_t);
		/* What is left of udi_size is the ancillary data length. */
		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
		tudi->OPT_length = udi_size;

		/*
		 * Add options if IP_RECVIF etc is set
		 */
		if (udi_size != 0) {
			conn_recvancillary_add(connp, recv_ancillary, ira,
			    &ipps, (uchar_t *)&sin[1], udi_size);
		}
		goto deliver;
	}

	ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
	/*
	 * IPv6 packets can only be received by applications
	 * that are prepared to receive IPv6 addresses.
	 * The IP fanout must ensure this.
	 */
	ASSERT(connp->conn_family == AF_INET6);

	/*
	 * Handle IPv6 packets. We don't pass up the IP headers with the
	 * payload for IPv6.
	 */

	ip6h = (ip6_t *)rptr;
	if (recv_ancillary.crb_all != 0) {
		/*
		 * Call on ip_find_hdr_v6 which gets individual lengths of
		 * extension headers (and pointers to them).
		 */
		uint8_t nexthdr;

		/* We don't care about the length or nextheader. */
		(void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);

		/*
		 * We do not pass up hop-by-hop options or any other
		 * extension header as part of the packet. Applications
		 * that want to see them have to specify IPV6_RECV* socket
		 * options. And conn_recvancillary_size/add explicitly
		 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
		 *
		 * If we had multilevel ICMP sockets, then we'd want to
		 * modify conn_recvancillary_size/add to
		 * allow the user to see the label.
		 */
	}

	/*
	 * Check a filter for ICMPv6 types if needed.
	 * Verify raw checksums if needed.
	 */
	mutex_enter(&connp->conn_lock);
	if (icmp->icmp_filter != NULL) {
		int type;

		/*
		 * IP is expected to have pulled up the ICMPv6 header, but
		 * verify that the type byte really is inside the first mblk
		 * before reading it; an ASSERT alone offers no protection
		 * in non-DEBUG builds.
		 */
		if (MBLKL(mp) <= ip_hdr_length) {
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
			return;
		}
		type = mp->b_rptr[ip_hdr_length];

		if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			return;
		}
	}
	if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
		/* Checksum */
		uint16_t *up;
		uint32_t sum;
		int remlen;

		/* up[0..15] covers the IPv6 source and destination. */
		up = (uint16_t *)&ip6h->ip6_src;

		remlen = msgdsize(mp) - ip_hdr_length;
		sum = htons(connp->conn_proto + remlen)
		    + up[0] + up[1] + up[2] + up[3]
		    + up[4] + up[5] + up[6] + up[7]
		    + up[8] + up[9] + up[10] + up[11]
		    + up[12] + up[13] + up[14] + up[15];
		sum = (sum & 0xffff) + (sum >> 16);
		sum = IP_CSUM(mp, ip_hdr_length, sum);
		if (sum != 0) {
			/* IPv6 RAW checksum failed */
			ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
			mutex_exit(&connp->conn_lock);
			freemsg(mp);
			BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);

	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);

	if (recv_ancillary.crb_all != 0) {
		udi_size += conn_recvancillary_size(connp,
		    recv_ancillary, ira, mp, &ipps);
	}

	mp1 = allocb(udi_size, BPRI_MED);
	if (mp1 == NULL) {
		freemsg(mp);
		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
		return;
	}
	mp1->b_cont = mp;
	mp1->b_datap->db_type = M_PROTO;
	tudi = (struct T_unitdata_ind *)mp1->b_rptr;
	mp1->b_wptr = (uchar_t *)tudi + udi_size;
	tudi->PRIM_type = T_UNITDATA_IND;
	tudi->SRC_length = sizeof (sin6_t);
	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
	/* What is left of udi_size is the ancillary data length. */
	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
	tudi->OPT_length = udi_size;
	sin6 = (sin6_t *)&tudi[1];
	*sin6 = sin6_null;
	sin6->sin6_port = 0;
	sin6->sin6_family = AF_INET6;

	sin6->sin6_addr = ip6h->ip6_src;
	/* No sin6_flowinfo per API */
	sin6->sin6_flowinfo = 0;
	/* For link-scope pass up scope id */
	if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
		sin6->sin6_scope_id = ira->ira_ruifindex;
	else
		sin6->sin6_scope_id = 0;
	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
	    IPCL_ZONEID(connp), is->is_netstack);

	if (udi_size != 0) {
		conn_recvancillary_add(connp, recv_ancillary, ira,
		    &ipps, (uchar_t *)&sin6[1], udi_size);
	}

	/* Skip all the IPv6 headers per API */
	mp->b_rptr += ip_hdr_length;
	pkt_len -= ip_hdr_length;

deliver:
	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
	icmp_ulp_recv(connp, mp1, pkt_len);
}

/*
 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
 * information that can be changing beneath us.
2841 */ 2842 mblk_t * 2843 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2844 { 2845 mblk_t *mpdata; 2846 struct opthdr *optp; 2847 conn_t *connp = Q_TO_CONN(q); 2848 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2849 mblk_t *mp2ctl; 2850 2851 /* 2852 * make a copy of the original message 2853 */ 2854 mp2ctl = copymsg(mpctl); 2855 2856 if (mpctl == NULL || 2857 (mpdata = mpctl->b_cont) == NULL) { 2858 freemsg(mpctl); 2859 freemsg(mp2ctl); 2860 return (0); 2861 } 2862 2863 /* fixed length structure for IPv4 and IPv6 counters */ 2864 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2865 optp->level = EXPER_RAWIP; 2866 optp->name = 0; 2867 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2868 sizeof (is->is_rawip_mib)); 2869 optp->len = msgdsize(mpdata); 2870 qreply(q, mpctl); 2871 2872 return (mp2ctl); 2873 } 2874 2875 /* 2876 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2877 * TODO: If this ever actually tries to set anything, it needs to be 2878 * to do the appropriate locking. 2879 */ 2880 /* ARGSUSED */ 2881 int 2882 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2883 uchar_t *ptr, int len) 2884 { 2885 switch (level) { 2886 case EXPER_RAWIP: 2887 return (0); 2888 default: 2889 return (1); 2890 } 2891 } 2892 2893 /* 2894 * This routine creates a T_UDERROR_IND message and passes it upstream. 2895 * The address and options are copied from the T_UNITDATA_REQ message 2896 * passed in mp. This message is freed. 
2897 */ 2898 static void 2899 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2900 { 2901 struct T_unitdata_req *tudr; 2902 mblk_t *mp1; 2903 uchar_t *destaddr; 2904 t_scalar_t destlen; 2905 uchar_t *optaddr; 2906 t_scalar_t optlen; 2907 2908 if ((mp->b_wptr < mp->b_rptr) || 2909 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2910 goto done; 2911 } 2912 tudr = (struct T_unitdata_req *)mp->b_rptr; 2913 destaddr = mp->b_rptr + tudr->DEST_offset; 2914 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2915 destaddr + tudr->DEST_length < mp->b_rptr || 2916 destaddr + tudr->DEST_length > mp->b_wptr) { 2917 goto done; 2918 } 2919 optaddr = mp->b_rptr + tudr->OPT_offset; 2920 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2921 optaddr + tudr->OPT_length < mp->b_rptr || 2922 optaddr + tudr->OPT_length > mp->b_wptr) { 2923 goto done; 2924 } 2925 destlen = tudr->DEST_length; 2926 optlen = tudr->OPT_length; 2927 2928 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2929 (char *)optaddr, optlen, err); 2930 if (mp1 != NULL) 2931 qreply(q, mp1); 2932 2933 done: 2934 freemsg(mp); 2935 } 2936 2937 static int 2938 rawip_do_unbind(conn_t *connp) 2939 { 2940 icmp_t *icmp = connp->conn_icmp; 2941 2942 mutex_enter(&connp->conn_lock); 2943 /* If a bind has not been done, we can't unbind. 
*/ 2944 if (icmp->icmp_state == TS_UNBND) { 2945 mutex_exit(&connp->conn_lock); 2946 return (-TOUTSTATE); 2947 } 2948 connp->conn_saddr_v6 = ipv6_all_zeros; 2949 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2950 connp->conn_laddr_v6 = ipv6_all_zeros; 2951 connp->conn_mcbc_bind = B_FALSE; 2952 connp->conn_lport = 0; 2953 connp->conn_fport = 0; 2954 /* In case we were also connected */ 2955 connp->conn_faddr_v6 = ipv6_all_zeros; 2956 connp->conn_v6lastdst = ipv6_all_zeros; 2957 2958 icmp->icmp_state = TS_UNBND; 2959 2960 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2961 &connp->conn_faddr_v6, connp->conn_flowinfo); 2962 mutex_exit(&connp->conn_lock); 2963 2964 ip_unbind(connp); 2965 return (0); 2966 } 2967 2968 /* 2969 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2970 * After some error checking, the message is passed downstream to ip. 2971 */ 2972 static void 2973 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2974 { 2975 conn_t *connp = Q_TO_CONN(q); 2976 int error; 2977 2978 ASSERT(mp->b_cont == NULL); 2979 error = rawip_do_unbind(connp); 2980 if (error) { 2981 if (error < 0) { 2982 icmp_err_ack(q, mp, -error, 0); 2983 } else { 2984 icmp_err_ack(q, mp, 0, error); 2985 } 2986 return; 2987 } 2988 2989 /* 2990 * Convert mp into a T_OK_ACK 2991 */ 2992 2993 mp = mi_tpi_ok_ack_alloc(mp); 2994 2995 /* 2996 * should not happen in practice... T_OK_ACK is smaller than the 2997 * original message. 2998 */ 2999 ASSERT(mp != NULL); 3000 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 3001 qreply(q, mp); 3002 } 3003 3004 /* 3005 * Process IPv4 packets that already include an IP header. 3006 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 3007 * IPPROTO_IGMP). 3008 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 3009 * 3010 * The packet is assumed to have a base (20 byte) IP header followed 3011 * by the upper-layer protocol. 
We include any IP_OPTIONS including a 3012 * CIPSO label but otherwise preserve the base IP header. 3013 */ 3014 static int 3015 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3016 { 3017 icmp_t *icmp = connp->conn_icmp; 3018 icmp_stack_t *is = icmp->icmp_is; 3019 ipha_t iphas; 3020 ipha_t *ipha; 3021 int ip_hdr_length; 3022 int tp_hdr_len; 3023 ip_xmit_attr_t *ixa; 3024 ip_pkt_t *ipp; 3025 in6_addr_t v6src; 3026 in6_addr_t v6dst; 3027 in6_addr_t v6nexthop; 3028 int error; 3029 boolean_t do_ipsec; 3030 3031 /* 3032 * We need an exclusive copy of conn_ixa since the included IP 3033 * header could have any destination. 3034 * That copy has no pointers hence we 3035 * need to set them up once we've parsed the ancillary data. 3036 */ 3037 ixa = conn_get_ixa_exclusive(connp); 3038 if (ixa == NULL) { 3039 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3040 freemsg(mp); 3041 return (ENOMEM); 3042 } 3043 ASSERT(cr != NULL); 3044 /* 3045 * Caller has a reference on cr; from db_credp or because we 3046 * are running in process context. 
3047 */ 3048 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3049 ixa->ixa_cred = cr; 3050 ixa->ixa_cpid = pid; 3051 if (is_system_labeled()) { 3052 /* We need to restart with a label based on the cred */ 3053 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3054 } 3055 3056 /* In case previous destination was multicast or multirt */ 3057 ip_attr_newdst(ixa); 3058 3059 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3060 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3061 if (ipp == NULL) { 3062 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3063 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3064 ixa->ixa_cpid = connp->conn_cpid; 3065 ixa_refrele(ixa); 3066 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3067 freemsg(mp); 3068 return (ENOMEM); 3069 } 3070 mutex_enter(&connp->conn_lock); 3071 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3072 mutex_exit(&connp->conn_lock); 3073 if (error != 0) { 3074 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3075 freemsg(mp); 3076 goto done; 3077 } 3078 3079 /* Sanity check length of packet */ 3080 ipha = (ipha_t *)mp->b_rptr; 3081 3082 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3083 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3084 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3085 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3086 freemsg(mp); 3087 goto done; 3088 } 3089 ipha = (ipha_t *)mp->b_rptr; 3090 } 3091 ipha->ipha_version_and_hdr_length = 3092 (IP_VERSION<<4) | (ip_hdr_length>>2); 3093 3094 /* 3095 * We set IXAF_DONTFRAG if the application set DF which makes 3096 * IP not fragment. 
3097 */ 3098 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3099 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3100 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3101 else 3102 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3103 3104 /* Even for multicast and broadcast we honor the apps ttl */ 3105 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3106 3107 /* 3108 * No source verification for non-local addresses 3109 */ 3110 if (ipha->ipha_src != INADDR_ANY && 3111 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3112 is->is_netstack->netstack_ip, B_FALSE) 3113 != IPVL_UNICAST_UP) { 3114 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3115 } 3116 3117 if (ipha->ipha_dst == INADDR_ANY) 3118 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3119 3120 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3121 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3122 3123 /* Defer IPsec if it might need to look at ICMP type/code */ 3124 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3125 ixa->ixa_flags |= IXAF_IS_IPV4; 3126 3127 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3128 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3129 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3130 (do_ipsec ? IPDF_IPSEC : 0)); 3131 switch (error) { 3132 case 0: 3133 break; 3134 case EADDRNOTAVAIL: 3135 /* 3136 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3137 * Don't have the application see that errno 3138 */ 3139 error = ENETUNREACH; 3140 goto failed; 3141 case ENETDOWN: 3142 /* 3143 * Have !ipif_addr_ready address; drop packet silently 3144 * until we can get applications to not send until we 3145 * are ready. 3146 */ 3147 error = 0; 3148 goto failed; 3149 case EHOSTUNREACH: 3150 case ENETUNREACH: 3151 if (ixa->ixa_ire != NULL) { 3152 /* 3153 * Let conn_ip_output/ire_send_noroute return 3154 * the error and send any local ICMP error. 
3155 */ 3156 error = 0; 3157 break; 3158 } 3159 /* FALLTHRU */ 3160 default: 3161 failed: 3162 freemsg(mp); 3163 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3164 goto done; 3165 } 3166 if (ipha->ipha_src == INADDR_ANY) 3167 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3168 3169 /* 3170 * We might be going to a different destination than last time, 3171 * thus check that TX allows the communication and compute any 3172 * needed label. 3173 * 3174 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3175 * don't have to worry about concurrent threads. 3176 */ 3177 if (is_system_labeled()) { 3178 /* 3179 * Check whether Trusted Solaris policy allows communication 3180 * with this host, and pretend that the destination is 3181 * unreachable if not. 3182 * Compute any needed label and place it in ipp_label_v4/v6. 3183 * 3184 * Later conn_build_hdr_template/conn_prepend_hdr takes 3185 * ipp_label_v4/v6 to form the packet. 3186 * 3187 * Tsol note: We have ipp structure local to this thread so 3188 * no locking is needed. 3189 */ 3190 error = conn_update_label(connp, ixa, &v6dst, ipp); 3191 if (error != 0) { 3192 freemsg(mp); 3193 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3194 goto done; 3195 } 3196 } 3197 3198 /* 3199 * Save away a copy of the IPv4 header the application passed down 3200 * and then prepend an IPv4 header complete with any IP options 3201 * including label. 3202 * We need a struct copy since icmp_prepend_hdr will reuse the available 3203 * space in the mblk. 
3204 */ 3205 iphas = *ipha; 3206 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3207 3208 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3209 if (mp == NULL) { 3210 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3211 ASSERT(error != 0); 3212 goto done; 3213 } 3214 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3215 error = EMSGSIZE; 3216 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3217 freemsg(mp); 3218 goto done; 3219 } 3220 /* Restore key parts of the header that the application passed down */ 3221 ipha = (ipha_t *)mp->b_rptr; 3222 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3223 ipha->ipha_ident = iphas.ipha_ident; 3224 ipha->ipha_fragment_offset_and_flags = 3225 iphas.ipha_fragment_offset_and_flags; 3226 ipha->ipha_ttl = iphas.ipha_ttl; 3227 ipha->ipha_protocol = iphas.ipha_protocol; 3228 ipha->ipha_src = iphas.ipha_src; 3229 ipha->ipha_dst = iphas.ipha_dst; 3230 3231 ixa->ixa_protocol = ipha->ipha_protocol; 3232 3233 /* 3234 * Make sure that the IP header plus any transport header that is 3235 * checksumed by ip_output is in the first mblk. (ip_output assumes 3236 * that at least the checksum field is in the first mblk.) 
3237 */ 3238 switch (ipha->ipha_protocol) { 3239 case IPPROTO_UDP: 3240 tp_hdr_len = 8; 3241 break; 3242 case IPPROTO_TCP: 3243 tp_hdr_len = 20; 3244 break; 3245 default: 3246 tp_hdr_len = 0; 3247 break; 3248 } 3249 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3250 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3251 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3252 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3253 if (mp->b_cont == NULL) 3254 error = EINVAL; 3255 else 3256 error = ENOMEM; 3257 freemsg(mp); 3258 goto done; 3259 } 3260 } 3261 3262 if (!do_ipsec) { 3263 /* Policy might differ for different ICMP type/code */ 3264 if (ixa->ixa_ipsec_policy != NULL) { 3265 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3266 ixa->ixa_ipsec_policy = NULL; 3267 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3268 } 3269 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3270 if (mp == NULL) { 3271 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3272 error = EHOSTUNREACH; /* IPsec policy failure */ 3273 goto done; 3274 } 3275 } 3276 3277 /* We're done. Pass the packet to ip. */ 3278 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3279 3280 error = conn_ip_output(mp, ixa); 3281 /* No rawipOutErrors if an error since IP increases its error counter */ 3282 switch (error) { 3283 case 0: 3284 break; 3285 case EWOULDBLOCK: 3286 (void) ixa_check_drain_insert(connp, ixa); 3287 error = 0; 3288 break; 3289 case EADDRNOTAVAIL: 3290 /* 3291 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3292 * Don't have the application see that errno 3293 */ 3294 error = ENETUNREACH; 3295 break; 3296 } 3297 done: 3298 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3299 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3300 ixa->ixa_cpid = connp->conn_cpid; 3301 ixa_refrele(ixa); 3302 ip_pkt_free(ipp); 3303 kmem_free(ipp, sizeof (*ipp)); 3304 return (error); 3305 } 3306 3307 static mblk_t * 3308 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3309 { 3310 ipha_t *ipha = NULL; 3311 ip6_t *ip6h = NULL; 3312 3313 if (ixa->ixa_flags & IXAF_IS_IPV4) 3314 ipha = (ipha_t *)mp->b_rptr; 3315 else 3316 ip6h = (ip6_t *)mp->b_rptr; 3317 3318 if (ixa->ixa_ipsec_policy != NULL) { 3319 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3320 ixa->ixa_ipsec_policy = NULL; 3321 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3322 } 3323 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3324 } 3325 3326 /* 3327 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3328 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3329 * the TPI options, otherwise we take them from msg_control. 3330 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3331 * Always consumes mp; never consumes tudr_mp. 3332 */ 3333 static int 3334 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3335 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3336 { 3337 icmp_t *icmp = connp->conn_icmp; 3338 icmp_stack_t *is = icmp->icmp_is; 3339 int error; 3340 ip_xmit_attr_t *ixa; 3341 ip_pkt_t *ipp; 3342 in6_addr_t v6src; 3343 in6_addr_t v6dst; 3344 in6_addr_t v6nexthop; 3345 in_port_t dstport; 3346 uint32_t flowinfo; 3347 int is_absreq_failure = 0; 3348 conn_opt_arg_t coas, *coa; 3349 3350 ASSERT(tudr_mp != NULL || msg != NULL); 3351 3352 /* 3353 * Get ixa before checking state to handle a disconnect race. 3354 * 3355 * We need an exclusive copy of conn_ixa since the ancillary data 3356 * options might modify it. 
That copy has no pointers hence we 3357 * need to set them up once we've parsed the ancillary data. 3358 */ 3359 ixa = conn_get_ixa_exclusive(connp); 3360 if (ixa == NULL) { 3361 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3362 freemsg(mp); 3363 return (ENOMEM); 3364 } 3365 ASSERT(cr != NULL); 3366 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3367 ixa->ixa_cred = cr; 3368 ixa->ixa_cpid = pid; 3369 if (is_system_labeled()) { 3370 /* We need to restart with a label based on the cred */ 3371 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3372 } 3373 3374 /* In case previous destination was multicast or multirt */ 3375 ip_attr_newdst(ixa); 3376 3377 /* Get a copy of conn_xmit_ipp since the options might change it */ 3378 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3379 if (ipp == NULL) { 3380 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3381 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3382 ixa->ixa_cpid = connp->conn_cpid; 3383 ixa_refrele(ixa); 3384 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3385 freemsg(mp); 3386 return (ENOMEM); 3387 } 3388 mutex_enter(&connp->conn_lock); 3389 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3390 mutex_exit(&connp->conn_lock); 3391 if (error != 0) { 3392 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3393 freemsg(mp); 3394 goto done; 3395 } 3396 3397 /* 3398 * Parse the options and update ixa and ipp as a result. 
3399 */ 3400 3401 coa = &coas; 3402 coa->coa_connp = connp; 3403 coa->coa_ixa = ixa; 3404 coa->coa_ipp = ipp; 3405 coa->coa_ancillary = B_TRUE; 3406 coa->coa_changed = 0; 3407 3408 if (msg != NULL) { 3409 error = process_auxiliary_options(connp, msg->msg_control, 3410 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3411 } else { 3412 struct T_unitdata_req *tudr; 3413 3414 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3415 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3416 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3417 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3418 coa, &is_absreq_failure); 3419 } 3420 if (error != 0) { 3421 /* 3422 * Note: No special action needed in this 3423 * module for "is_absreq_failure" 3424 */ 3425 freemsg(mp); 3426 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3427 goto done; 3428 } 3429 ASSERT(is_absreq_failure == 0); 3430 3431 mutex_enter(&connp->conn_lock); 3432 /* 3433 * If laddr is unspecified then we look at sin6_src_id. 3434 * We will give precedence to a source address set with IPV6_PKTINFO 3435 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3436 * want ip_attr_connect to select a source (since it can fail) when 3437 * IPV6_PKTINFO is specified. 3438 * If this doesn't result in a source address then we get a source 3439 * from ip_attr_connect() below. 
3440 */ 3441 v6src = connp->conn_saddr_v6; 3442 if (sin != NULL) { 3443 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3444 dstport = sin->sin_port; 3445 flowinfo = 0; 3446 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3447 ixa->ixa_flags |= IXAF_IS_IPV4; 3448 } else if (sin6 != NULL) { 3449 boolean_t v4mapped; 3450 uint_t srcid; 3451 3452 v6dst = sin6->sin6_addr; 3453 dstport = sin6->sin6_port; 3454 flowinfo = sin6->sin6_flowinfo; 3455 srcid = sin6->__sin6_src_id; 3456 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3457 ixa->ixa_scopeid = sin6->sin6_scope_id; 3458 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3459 } else { 3460 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3461 } 3462 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst); 3463 if (v4mapped) 3464 ixa->ixa_flags |= IXAF_IS_IPV4; 3465 else 3466 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3467 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3468 if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3469 v4mapped, connp->conn_netstack)) { 3470 /* Mismatched v4mapped/v6 specified by srcid. */ 3471 mutex_exit(&connp->conn_lock); 3472 error = EADDRNOTAVAIL; 3473 goto failed; /* Does freemsg() and mib. */ 3474 } 3475 } 3476 } else { 3477 /* Connected case */ 3478 v6dst = connp->conn_faddr_v6; 3479 flowinfo = connp->conn_flowinfo; 3480 } 3481 mutex_exit(&connp->conn_lock); 3482 /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. 
*/ 3483 if (ipp->ipp_fields & IPPF_ADDR) { 3484 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3485 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3486 v6src = ipp->ipp_addr; 3487 } else { 3488 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3489 v6src = ipp->ipp_addr; 3490 } 3491 } 3492 /* 3493 * Allow source not assigned to the system 3494 * only if it is not a local addresses 3495 */ 3496 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3497 ip_laddr_t laddr_type; 3498 3499 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3500 ipaddr_t v4src; 3501 3502 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3503 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3504 is->is_netstack->netstack_ip, B_FALSE); 3505 } else { 3506 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3507 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3508 } 3509 if (laddr_type != IPVL_UNICAST_UP) 3510 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3511 } 3512 3513 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3514 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3515 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3516 3517 switch (error) { 3518 case 0: 3519 break; 3520 case EADDRNOTAVAIL: 3521 /* 3522 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3523 * Don't have the application see that errno 3524 */ 3525 error = ENETUNREACH; 3526 goto failed; 3527 case ENETDOWN: 3528 /* 3529 * Have !ipif_addr_ready address; drop packet silently 3530 * until we can get applications to not send until we 3531 * are ready. 3532 */ 3533 error = 0; 3534 goto failed; 3535 case EHOSTUNREACH: 3536 case ENETUNREACH: 3537 if (ixa->ixa_ire != NULL) { 3538 /* 3539 * Let conn_ip_output/ire_send_noroute return 3540 * the error and send any local ICMP error. 
3541 */ 3542 error = 0; 3543 break; 3544 } 3545 /* FALLTHRU */ 3546 default: 3547 failed: 3548 freemsg(mp); 3549 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3550 goto done; 3551 } 3552 3553 /* 3554 * We might be going to a different destination than last time, 3555 * thus check that TX allows the communication and compute any 3556 * needed label. 3557 * 3558 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3559 * don't have to worry about concurrent threads. 3560 */ 3561 if (is_system_labeled()) { 3562 /* 3563 * Check whether Trusted Solaris policy allows communication 3564 * with this host, and pretend that the destination is 3565 * unreachable if not. 3566 * Compute any needed label and place it in ipp_label_v4/v6. 3567 * 3568 * Later conn_build_hdr_template/conn_prepend_hdr takes 3569 * ipp_label_v4/v6 to form the packet. 3570 * 3571 * Tsol note: We have ipp structure local to this thread so 3572 * no locking is needed. 3573 */ 3574 error = conn_update_label(connp, ixa, &v6dst, ipp); 3575 if (error != 0) { 3576 freemsg(mp); 3577 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3578 goto done; 3579 } 3580 } 3581 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3582 &error); 3583 if (mp == NULL) { 3584 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3585 ASSERT(error != 0); 3586 goto done; 3587 } 3588 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3589 error = EMSGSIZE; 3590 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3591 freemsg(mp); 3592 goto done; 3593 } 3594 3595 /* Policy might differ for different ICMP type/code */ 3596 mp = icmp_output_attach_policy(mp, connp, ixa); 3597 if (mp == NULL) { 3598 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3599 error = EHOSTUNREACH; /* IPsec policy failure */ 3600 goto done; 3601 } 3602 3603 /* We're done. Pass the packet to ip. 
*/ 3604 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3605 3606 error = conn_ip_output(mp, ixa); 3607 if (!connp->conn_unspec_src) 3608 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3609 /* No rawipOutErrors if an error since IP increases its error counter */ 3610 switch (error) { 3611 case 0: 3612 break; 3613 case EWOULDBLOCK: 3614 (void) ixa_check_drain_insert(connp, ixa); 3615 error = 0; 3616 break; 3617 case EADDRNOTAVAIL: 3618 /* 3619 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3620 * Don't have the application see that errno 3621 */ 3622 error = ENETUNREACH; 3623 /* FALLTHRU */ 3624 default: 3625 mutex_enter(&connp->conn_lock); 3626 /* 3627 * Clear the source and v6lastdst so we call ip_attr_connect 3628 * for the next packet and try to pick a better source. 3629 */ 3630 if (connp->conn_mcbc_bind) 3631 connp->conn_saddr_v6 = ipv6_all_zeros; 3632 else 3633 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3634 connp->conn_v6lastdst = ipv6_all_zeros; 3635 mutex_exit(&connp->conn_lock); 3636 break; 3637 } 3638 done: 3639 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3640 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3641 ixa->ixa_cpid = connp->conn_cpid; 3642 ixa_refrele(ixa); 3643 ip_pkt_free(ipp); 3644 kmem_free(ipp, sizeof (*ipp)); 3645 return (error); 3646 } 3647 3648 /* 3649 * Handle sending an M_DATA for a connected socket. 3650 * Handles both IPv4 and IPv6. 3651 */ 3652 int 3653 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3654 { 3655 icmp_t *icmp = connp->conn_icmp; 3656 icmp_stack_t *is = icmp->icmp_is; 3657 int error; 3658 ip_xmit_attr_t *ixa; 3659 boolean_t do_ipsec; 3660 3661 /* 3662 * If no other thread is using conn_ixa this just gets a reference to 3663 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 
3664 */ 3665 ixa = conn_get_ixa(connp, B_FALSE); 3666 if (ixa == NULL) { 3667 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3668 freemsg(mp); 3669 return (ENOMEM); 3670 } 3671 3672 ASSERT(cr != NULL); 3673 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3674 ixa->ixa_cred = cr; 3675 ixa->ixa_cpid = pid; 3676 3677 /* Defer IPsec if it might need to look at ICMP type/code */ 3678 switch (ixa->ixa_protocol) { 3679 case IPPROTO_ICMP: 3680 case IPPROTO_ICMPV6: 3681 do_ipsec = B_FALSE; 3682 break; 3683 default: 3684 do_ipsec = B_TRUE; 3685 } 3686 3687 mutex_enter(&connp->conn_lock); 3688 mp = icmp_prepend_header_template(connp, ixa, mp, 3689 &connp->conn_saddr_v6, connp->conn_flowinfo, &error); 3690 3691 if (mp == NULL) { 3692 ASSERT(error != 0); 3693 mutex_exit(&connp->conn_lock); 3694 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3695 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3696 ixa->ixa_cpid = connp->conn_cpid; 3697 ixa_refrele(ixa); 3698 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3699 freemsg(mp); 3700 return (error); 3701 } 3702 3703 if (!do_ipsec) { 3704 /* Policy might differ for different ICMP type/code */ 3705 mp = icmp_output_attach_policy(mp, connp, ixa); 3706 if (mp == NULL) { 3707 mutex_exit(&connp->conn_lock); 3708 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3709 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3710 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3711 ixa->ixa_cpid = connp->conn_cpid; 3712 ixa_refrele(ixa); 3713 return (EHOSTUNREACH); /* IPsec policy failure */ 3714 } 3715 } 3716 3717 /* 3718 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3719 * safe copy, then we need to fill in any pointers in it. 
3720 */ 3721 if (ixa->ixa_ire == NULL) { 3722 in6_addr_t faddr, saddr; 3723 in6_addr_t nexthop; 3724 in_port_t fport; 3725 3726 saddr = connp->conn_saddr_v6; 3727 faddr = connp->conn_faddr_v6; 3728 fport = connp->conn_fport; 3729 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); 3730 mutex_exit(&connp->conn_lock); 3731 3732 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, 3733 fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3734 (do_ipsec ? IPDF_IPSEC : 0)); 3735 switch (error) { 3736 case 0: 3737 break; 3738 case EADDRNOTAVAIL: 3739 /* 3740 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3741 * Don't have the application see that errno 3742 */ 3743 error = ENETUNREACH; 3744 goto failed; 3745 case ENETDOWN: 3746 /* 3747 * Have !ipif_addr_ready address; drop packet silently 3748 * until we can get applications to not send until we 3749 * are ready. 3750 */ 3751 error = 0; 3752 goto failed; 3753 case EHOSTUNREACH: 3754 case ENETUNREACH: 3755 if (ixa->ixa_ire != NULL) { 3756 /* 3757 * Let conn_ip_output/ire_send_noroute return 3758 * the error and send any local ICMP error. 3759 */ 3760 error = 0; 3761 break; 3762 } 3763 /* FALLTHRU */ 3764 default: 3765 failed: 3766 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3767 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3768 ixa->ixa_cpid = connp->conn_cpid; 3769 ixa_refrele(ixa); 3770 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3771 freemsg(mp); 3772 return (error); 3773 } 3774 } else { 3775 /* Done with conn_t */ 3776 mutex_exit(&connp->conn_lock); 3777 } 3778 3779 /* We're done. Pass the packet to ip. 
*/ 3780 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3781 3782 error = conn_ip_output(mp, ixa); 3783 /* No rawipOutErrors if an error since IP increases its error counter */ 3784 switch (error) { 3785 case 0: 3786 break; 3787 case EWOULDBLOCK: 3788 (void) ixa_check_drain_insert(connp, ixa); 3789 error = 0; 3790 break; 3791 case EADDRNOTAVAIL: 3792 /* 3793 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3794 * Don't have the application see that errno 3795 */ 3796 error = ENETUNREACH; 3797 break; 3798 } 3799 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3800 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3801 ixa->ixa_cpid = connp->conn_cpid; 3802 ixa_refrele(ixa); 3803 return (error); 3804 } 3805 3806 /* 3807 * Handle sending an M_DATA to the last destination. 3808 * Handles both IPv4 and IPv6. 3809 * 3810 * NOTE: The caller must hold conn_lock and we drop it here. 3811 */ 3812 int 3813 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, 3814 ip_xmit_attr_t *ixa) 3815 { 3816 icmp_t *icmp = connp->conn_icmp; 3817 icmp_stack_t *is = icmp->icmp_is; 3818 int error; 3819 boolean_t do_ipsec; 3820 3821 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3822 ASSERT(ixa != NULL); 3823 3824 ASSERT(cr != NULL); 3825 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3826 ixa->ixa_cred = cr; 3827 ixa->ixa_cpid = pid; 3828 3829 /* Defer IPsec if it might need to look at ICMP type/code */ 3830 switch (ixa->ixa_protocol) { 3831 case IPPROTO_ICMP: 3832 case IPPROTO_ICMPV6: 3833 do_ipsec = B_FALSE; 3834 break; 3835 default: 3836 do_ipsec = B_TRUE; 3837 } 3838 3839 3840 mp = icmp_prepend_header_template(connp, ixa, mp, 3841 &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); 3842 3843 if (mp == NULL) { 3844 ASSERT(error != 0); 3845 mutex_exit(&connp->conn_lock); 3846 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3847 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3848 ixa->ixa_cpid = connp->conn_cpid; 3849 ixa_refrele(ixa); 3850 BUMP_MIB(&is->is_rawip_mib, 
rawipOutErrors); 3851 freemsg(mp); 3852 return (error); 3853 } 3854 3855 if (!do_ipsec) { 3856 /* Policy might differ for different ICMP type/code */ 3857 mp = icmp_output_attach_policy(mp, connp, ixa); 3858 if (mp == NULL) { 3859 mutex_exit(&connp->conn_lock); 3860 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3861 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3862 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3863 ixa->ixa_cpid = connp->conn_cpid; 3864 ixa_refrele(ixa); 3865 return (EHOSTUNREACH); /* IPsec policy failure */ 3866 } 3867 } 3868 3869 /* 3870 * In case we got a safe copy of conn_ixa, or if opt_set made us a new 3871 * safe copy, then we need to fill in any pointers in it. 3872 */ 3873 if (ixa->ixa_ire == NULL) { 3874 in6_addr_t lastdst, lastsrc; 3875 in6_addr_t nexthop; 3876 in_port_t lastport; 3877 3878 lastsrc = connp->conn_v6lastsrc; 3879 lastdst = connp->conn_v6lastdst; 3880 lastport = connp->conn_lastdstport; 3881 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); 3882 mutex_exit(&connp->conn_lock); 3883 3884 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, 3885 &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | 3886 IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); 3887 switch (error) { 3888 case 0: 3889 break; 3890 case EADDRNOTAVAIL: 3891 /* 3892 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3893 * Don't have the application see that errno 3894 */ 3895 error = ENETUNREACH; 3896 goto failed; 3897 case ENETDOWN: 3898 /* 3899 * Have !ipif_addr_ready address; drop packet silently 3900 * until we can get applications to not send until we 3901 * are ready. 3902 */ 3903 error = 0; 3904 goto failed; 3905 case EHOSTUNREACH: 3906 case ENETUNREACH: 3907 if (ixa->ixa_ire != NULL) { 3908 /* 3909 * Let conn_ip_output/ire_send_noroute return 3910 * the error and send any local ICMP error. 
3911 */ 3912 error = 0; 3913 break; 3914 } 3915 /* FALLTHRU */ 3916 default: 3917 failed: 3918 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3919 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3920 ixa->ixa_cpid = connp->conn_cpid; 3921 ixa_refrele(ixa); 3922 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3923 freemsg(mp); 3924 return (error); 3925 } 3926 } else { 3927 /* Done with conn_t */ 3928 mutex_exit(&connp->conn_lock); 3929 } 3930 3931 /* We're done. Pass the packet to ip. */ 3932 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3933 error = conn_ip_output(mp, ixa); 3934 /* No rawipOutErrors if an error since IP increases its error counter */ 3935 switch (error) { 3936 case 0: 3937 break; 3938 case EWOULDBLOCK: 3939 (void) ixa_check_drain_insert(connp, ixa); 3940 error = 0; 3941 break; 3942 case EADDRNOTAVAIL: 3943 /* 3944 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3945 * Don't have the application see that errno 3946 */ 3947 error = ENETUNREACH; 3948 /* FALLTHRU */ 3949 default: 3950 mutex_enter(&connp->conn_lock); 3951 /* 3952 * Clear the source and v6lastdst so we call ip_attr_connect 3953 * for the next packet and try to pick a better source. 3954 */ 3955 if (connp->conn_mcbc_bind) 3956 connp->conn_saddr_v6 = ipv6_all_zeros; 3957 else 3958 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3959 connp->conn_v6lastdst = ipv6_all_zeros; 3960 mutex_exit(&connp->conn_lock); 3961 break; 3962 } 3963 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED)); 3964 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3965 ixa->ixa_cpid = connp->conn_cpid; 3966 ixa_refrele(ixa); 3967 return (error); 3968 } 3969 3970 3971 /* 3972 * Prepend the header template and then fill in the source and 3973 * flowinfo. The caller needs to handle the destination address since 3974 * it's setting is different if rthdr or source route. 3975 * 3976 * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. 3977 * When it returns NULL it sets errorp. 
3978 */ 3979 static mblk_t * 3980 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, 3981 const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) 3982 { 3983 icmp_t *icmp = connp->conn_icmp; 3984 icmp_stack_t *is = icmp->icmp_is; 3985 uint_t pktlen; 3986 uint_t copylen; 3987 uint8_t *iph; 3988 uint_t ip_hdr_length; 3989 uint32_t cksum; 3990 ip_pkt_t *ipp; 3991 3992 ASSERT(MUTEX_HELD(&connp->conn_lock)); 3993 3994 /* 3995 * Copy the header template. 3996 */ 3997 copylen = connp->conn_ht_iphc_len; 3998 pktlen = copylen + msgdsize(mp); 3999 if (pktlen > IP_MAXPACKET) { 4000 freemsg(mp); 4001 *errorp = EMSGSIZE; 4002 return (NULL); 4003 } 4004 ixa->ixa_pktlen = pktlen; 4005 4006 /* check/fix buffer config, setup pointers into it */ 4007 iph = mp->b_rptr - copylen; 4008 if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { 4009 mblk_t *mp1; 4010 4011 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); 4012 if (mp1 == NULL) { 4013 freemsg(mp); 4014 *errorp = ENOMEM; 4015 return (NULL); 4016 } 4017 mp1->b_wptr = DB_LIM(mp1); 4018 mp1->b_cont = mp; 4019 mp = mp1; 4020 iph = (mp->b_wptr - copylen); 4021 } 4022 mp->b_rptr = iph; 4023 bcopy(connp->conn_ht_iphc, iph, copylen); 4024 ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); 4025 4026 ixa->ixa_ip_hdr_length = ip_hdr_length; 4027 4028 /* 4029 * Prepare for ICMPv6 checksum done in IP. 4030 * 4031 * icmp_build_hdr_template has already massaged any routing header 4032 * and placed the result in conn_sum. 4033 * 4034 * We make it easy for IP to include our pseudo header 4035 * by putting our length (and any routing header adjustment) 4036 * in the ICMPv6 checksum field. 
4037 */ 4038 cksum = pktlen - ip_hdr_length; 4039 4040 cksum += connp->conn_sum; 4041 cksum = (cksum >> 16) + (cksum & 0xFFFF); 4042 ASSERT(cksum < 0x10000); 4043 4044 ipp = &connp->conn_xmit_ipp; 4045 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4046 ipha_t *ipha = (ipha_t *)iph; 4047 4048 ipha->ipha_length = htons((uint16_t)pktlen); 4049 4050 /* if IP_PKTINFO specified an addres it wins over bind() */ 4051 if ((ipp->ipp_fields & IPPF_ADDR) && 4052 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4053 ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); 4054 ipha->ipha_src = ipp->ipp_addr_v4; 4055 } else { 4056 IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); 4057 } 4058 } else { 4059 ip6_t *ip6h = (ip6_t *)iph; 4060 uint_t cksum_offset = 0; 4061 4062 ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); 4063 4064 /* if IP_PKTINFO specified an addres it wins over bind() */ 4065 if ((ipp->ipp_fields & IPPF_ADDR) && 4066 !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { 4067 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); 4068 ip6h->ip6_src = ipp->ipp_addr; 4069 } else { 4070 ip6h->ip6_src = *v6src; 4071 } 4072 ip6h->ip6_vcf = 4073 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 4074 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 4075 if (ipp->ipp_fields & IPPF_TCLASS) { 4076 /* Overrides the class part of flowinfo */ 4077 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, 4078 ipp->ipp_tclass); 4079 } 4080 4081 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 4082 if (connp->conn_proto == IPPROTO_ICMPV6) { 4083 cksum_offset = ixa->ixa_ip_hdr_length + 4084 offsetof(icmp6_t, icmp6_cksum); 4085 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 4086 cksum_offset = ixa->ixa_ip_hdr_length + 4087 ixa->ixa_raw_cksum_offset; 4088 } 4089 } 4090 if (cksum_offset != 0) { 4091 uint16_t *ptr; 4092 4093 /* Make sure the checksum fits in the first mblk */ 4094 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 4095 mblk_t *mp1; 4096 4097 mp1 = msgpullup(mp, 4098 cksum_offset + sizeof (short)); 4099 freemsg(mp); 4100 if (mp1 
== NULL) { 4101 *errorp = ENOMEM; 4102 return (NULL); 4103 } 4104 mp = mp1; 4105 iph = mp->b_rptr; 4106 ip6h = (ip6_t *)iph; 4107 } 4108 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 4109 *ptr = htons(cksum); 4110 } 4111 } 4112 4113 return (mp); 4114 } 4115 4116 /* 4117 * This routine handles all messages passed downstream. It either 4118 * consumes the message or passes it downstream; it never queues a 4119 * a message. 4120 */ 4121 int 4122 icmp_wput(queue_t *q, mblk_t *mp) 4123 { 4124 sin6_t *sin6; 4125 sin_t *sin = NULL; 4126 uint_t srcid; 4127 conn_t *connp = Q_TO_CONN(q); 4128 icmp_t *icmp = connp->conn_icmp; 4129 int error = 0; 4130 struct sockaddr *addr = NULL; 4131 socklen_t addrlen; 4132 icmp_stack_t *is = icmp->icmp_is; 4133 struct T_unitdata_req *tudr; 4134 mblk_t *data_mp; 4135 cred_t *cr; 4136 pid_t pid; 4137 4138 /* 4139 * We directly handle several cases here: T_UNITDATA_REQ message 4140 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected 4141 * socket. 4142 */ 4143 switch (DB_TYPE(mp)) { 4144 case M_DATA: 4145 /* sockfs never sends down M_DATA */ 4146 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4147 freemsg(mp); 4148 return (0); 4149 4150 case M_PROTO: 4151 case M_PCPROTO: 4152 tudr = (struct T_unitdata_req *)mp->b_rptr; 4153 if (MBLKL(mp) < sizeof (*tudr) || 4154 ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { 4155 icmp_wput_other(q, mp); 4156 return (0); 4157 } 4158 break; 4159 4160 default: 4161 icmp_wput_other(q, mp); 4162 return (0); 4163 } 4164 4165 /* Handle valid T_UNITDATA_REQ here */ 4166 data_mp = mp->b_cont; 4167 if (data_mp == NULL) { 4168 error = EPROTO; 4169 goto ud_error2; 4170 } 4171 mp->b_cont = NULL; 4172 4173 if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { 4174 error = EADDRNOTAVAIL; 4175 goto ud_error2; 4176 } 4177 4178 /* 4179 * All Solaris components should pass a db_credp 4180 * for this message, hence we ASSERT. 
4181 * On production kernels we return an error to be robust against 4182 * random streams modules sitting on top of us. 4183 */ 4184 cr = msg_getcred(mp, &pid); 4185 ASSERT(cr != NULL); 4186 if (cr == NULL) { 4187 error = EINVAL; 4188 goto ud_error2; 4189 } 4190 4191 /* 4192 * If a port has not been bound to the stream, fail. 4193 * This is not a problem when sockfs is directly 4194 * above us, because it will ensure that the socket 4195 * is first bound before allowing data to be sent. 4196 */ 4197 if (icmp->icmp_state == TS_UNBND) { 4198 error = EPROTO; 4199 goto ud_error2; 4200 } 4201 addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; 4202 addrlen = tudr->DEST_length; 4203 4204 switch (connp->conn_family) { 4205 case AF_INET6: 4206 sin6 = (sin6_t *)addr; 4207 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || 4208 (sin6->sin6_family != AF_INET6)) { 4209 error = EADDRNOTAVAIL; 4210 goto ud_error2; 4211 } 4212 4213 /* No support for mapped addresses on raw sockets */ 4214 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 4215 error = EADDRNOTAVAIL; 4216 goto ud_error2; 4217 } 4218 srcid = sin6->__sin6_src_id; 4219 4220 /* 4221 * If the local address is a mapped address return 4222 * an error. 4223 * It would be possible to send an IPv6 packet but the 4224 * response would never make it back to the application 4225 * since it is bound to a mapped address. 4226 */ 4227 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 4228 error = EADDRNOTAVAIL; 4229 goto ud_error2; 4230 } 4231 4232 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 4233 sin6->sin6_addr = ipv6_loopback; 4234 4235 if (tudr->OPT_length != 0) { 4236 /* 4237 * If we are connected then the destination needs to be 4238 * the same as the connected one. 
4239 */ 4240 if (icmp->icmp_state == TS_DATA_XFER && 4241 !conn_same_as_last_v6(connp, sin6)) { 4242 error = EISCONN; 4243 goto ud_error2; 4244 } 4245 error = icmp_output_ancillary(connp, NULL, sin6, 4246 data_mp, mp, NULL, cr, pid); 4247 } else { 4248 ip_xmit_attr_t *ixa; 4249 4250 /* 4251 * We have to allocate an ip_xmit_attr_t before we grab 4252 * conn_lock and we need to hold conn_lock once we've 4253 * checked conn_same_as_last_v6 to handle concurrent 4254 * send* calls on a socket. 4255 */ 4256 ixa = conn_get_ixa(connp, B_FALSE); 4257 if (ixa == NULL) { 4258 error = ENOMEM; 4259 goto ud_error2; 4260 } 4261 mutex_enter(&connp->conn_lock); 4262 4263 if (conn_same_as_last_v6(connp, sin6) && 4264 connp->conn_lastsrcid == srcid && 4265 ipsec_outbound_policy_current(ixa)) { 4266 /* icmp_output_lastdst drops conn_lock */ 4267 error = icmp_output_lastdst(connp, data_mp, cr, 4268 pid, ixa); 4269 } else { 4270 /* icmp_output_newdst drops conn_lock */ 4271 error = icmp_output_newdst(connp, data_mp, NULL, 4272 sin6, cr, pid, ixa); 4273 } 4274 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4275 } 4276 if (error == 0) { 4277 freeb(mp); 4278 return (0); 4279 } 4280 break; 4281 4282 case AF_INET: 4283 sin = (sin_t *)addr; 4284 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || 4285 (sin->sin_family != AF_INET)) { 4286 error = EADDRNOTAVAIL; 4287 goto ud_error2; 4288 } 4289 if (sin->sin_addr.s_addr == INADDR_ANY) 4290 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 4291 4292 /* Protocol 255 contains full IP headers */ 4293 /* Read without holding lock */ 4294 if (icmp->icmp_hdrincl) { 4295 if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { 4296 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { 4297 error = EINVAL; 4298 goto ud_error2; 4299 } 4300 } 4301 error = icmp_output_hdrincl(connp, data_mp, cr, pid); 4302 if (error == 0) { 4303 freeb(mp); 4304 return (0); 4305 } 4306 /* data_mp consumed above */ 4307 data_mp = NULL; 4308 goto ud_error2; 4309 } 4310 4311 if 
(tudr->OPT_length != 0) { 4312 /* 4313 * If we are connected then the destination needs to be 4314 * the same as the connected one. 4315 */ 4316 if (icmp->icmp_state == TS_DATA_XFER && 4317 !conn_same_as_last_v4(connp, sin)) { 4318 error = EISCONN; 4319 goto ud_error2; 4320 } 4321 error = icmp_output_ancillary(connp, sin, NULL, 4322 data_mp, mp, NULL, cr, pid); 4323 } else { 4324 ip_xmit_attr_t *ixa; 4325 4326 /* 4327 * We have to allocate an ip_xmit_attr_t before we grab 4328 * conn_lock and we need to hold conn_lock once we've 4329 * checked conn_same_as_last_v4 to handle concurrent 4330 * send* calls on a socket. 4331 */ 4332 ixa = conn_get_ixa(connp, B_FALSE); 4333 if (ixa == NULL) { 4334 error = ENOMEM; 4335 goto ud_error2; 4336 } 4337 mutex_enter(&connp->conn_lock); 4338 4339 if (conn_same_as_last_v4(connp, sin) && 4340 ipsec_outbound_policy_current(ixa)) { 4341 /* icmp_output_lastdst drops conn_lock */ 4342 error = icmp_output_lastdst(connp, data_mp, cr, 4343 pid, ixa); 4344 } else { 4345 /* icmp_output_newdst drops conn_lock */ 4346 error = icmp_output_newdst(connp, data_mp, sin, 4347 NULL, cr, pid, ixa); 4348 } 4349 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 4350 } 4351 if (error == 0) { 4352 freeb(mp); 4353 return (0); 4354 } 4355 break; 4356 } 4357 ASSERT(mp != NULL); 4358 /* mp is freed by the following routine */ 4359 icmp_ud_err(q, mp, (t_scalar_t)error); 4360 return (0); 4361 4362 ud_error2: 4363 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4364 freemsg(data_mp); 4365 ASSERT(mp != NULL); 4366 /* mp is freed by the following routine */ 4367 icmp_ud_err(q, mp, (t_scalar_t)error); 4368 return (0); 4369 } 4370 4371 /* 4372 * Handle the case of the IP address or flow label being different 4373 * for both IPv4 and IPv6. 4374 * 4375 * NOTE: The caller must hold conn_lock and we drop it here. 
/*
 * Handle the case of the IP address or flow label being different
 * for both IPv4 and IPv6.
 *
 * The destination is taken from sin when it is non-NULL and from sin6
 * otherwise. Returns 0 or an errno value. Every return path restores
 * ixa_cred/ixa_cpid from the conn_t and calls ixa_refrele() on ixa; the
 * ud_error path additionally frees data_mp and bumps rawipOutErrors.
 *
 * NOTE: The caller must hold conn_lock and we drop it here.
 */
static int
icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
    cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
{
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		error;
	ip_xmit_attr_t	*oldixa;
	boolean_t	do_ipsec;
	uint_t		srcid;
	uint32_t	flowinfo;
	in6_addr_t	v6src;
	in6_addr_t	v6dst;
	in6_addr_t	v6nexthop;
	in_port_t	dstport;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT(ixa != NULL);

	/*
	 * We hold conn_lock across all the use and modifications of
	 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
	 * stay consistent.
	 */

	ASSERT(cr != NULL);
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	/* Send with the caller's cred/pid; both are restored before return. */
	ixa->ixa_cred = cr;
	ixa->ixa_cpid = pid;
	if (is_system_labeled()) {
		/* We need to restart with a label based on the cred */
		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
	}
	/*
	 * If we are connected then the destination needs to be the
	 * same as the connected one, which is not the case here since we
	 * checked for that above.
	 */
	if (icmp->icmp_state == TS_DATA_XFER) {
		mutex_exit(&connp->conn_lock);
		error = EISCONN;
		goto ud_error;
	}

	/* In case previous destination was multicast or multirt */
	ip_attr_newdst(ixa);

	/*
	 * If laddr is unspecified then we look at sin6_src_id.
	 * We will give precedence to a source address set with IPV6_PKTINFO
	 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
	 * want ip_attr_connect to select a source (since it can fail) when
	 * IPV6_PKTINFO is specified.
	 * If this doesn't result in a source address then we get a source
	 * from ip_attr_connect() below.
	 */
	v6src = connp->conn_saddr_v6;
	if (sin != NULL) {
		/* IPv4 destination: carried internally as a v4-mapped address */
		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
		dstport = sin->sin_port;
		flowinfo = 0;
		/* Don't bother with ip_srcid_find_id(), but indicate anyway. */
		srcid = 0;
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		ixa->ixa_flags |= IXAF_IS_IPV4;
	} else {
		boolean_t v4mapped;

		v6dst = sin6->sin6_addr;
		dstport = sin6->sin6_port;
		flowinfo = sin6->sin6_flowinfo;
		srcid = sin6->__sin6_src_id;
		/* A scope id is only honored for link-scope destinations. */
		if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
			ixa->ixa_scopeid = sin6->sin6_scope_id;
			ixa->ixa_flags |= IXAF_SCOPEID_SET;
		} else {
			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		}
		v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
		if (v4mapped)
			ixa->ixa_flags |= IXAF_IS_IPV4;
		else
			ixa->ixa_flags &= ~IXAF_IS_IPV4;
		if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
			if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
			    v4mapped, connp->conn_netstack)) {
				/* Mismatched v4mapped/v6 specified by srcid. */
				mutex_exit(&connp->conn_lock);
				error = EADDRNOTAVAIL;
				goto ud_error;
			}
		}
	}
	/* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
	if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
		ip_pkt_t *ipp = &connp->conn_xmit_ipp;

		/* Only use ipp_addr when its family matches the destination. */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		} else {
			if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
				v6src = ipp->ipp_addr;
		}
	}

	/* Defer IPsec if it might need to look at ICMP type/code */
	switch (ixa->ixa_protocol) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		do_ipsec = B_FALSE;
		break;
	default:
		do_ipsec = B_TRUE;
	}

	ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
	/* conn_lock is not held across ip_attr_connect(); re-taken below. */
	mutex_exit(&connp->conn_lock);

	error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
	    &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
	    (do_ipsec ? IPDF_IPSEC : 0));
	switch (error) {
	case 0:
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		goto failed;
	case ENETDOWN:
		/*
		 * Have !ipif_addr_ready address; drop packet silently
		 * until we can get applications to not send until we
		 * are ready.
		 *
		 * The ud_error path still frees data_mp and bumps
		 * rawipOutErrors; only the returned value is 0.
		 */
		error = 0;
		goto failed;
	case EHOSTUNREACH:
	case ENETUNREACH:
		if (ixa->ixa_ire != NULL) {
			/*
			 * Let conn_ip_output/ire_send_noroute return
			 * the error and send any local ICMP error.
			 */
			error = 0;
			break;
		}
		/* FALLTHRU */
	default:
	failed:
		goto ud_error;
	}

	mutex_enter(&connp->conn_lock);
	/*
	 * While we dropped the lock some other thread might have connected
	 * this socket. If so we bail out with EISCONN to ensure that the
	 * connecting thread is the one that updates conn_ixa, conn_ht_*
	 * and conn_*last*.
	 */
	if (icmp->icmp_state == TS_DATA_XFER) {
		mutex_exit(&connp->conn_lock);
		error = EISCONN;
		goto ud_error;
	}

	/*
	 * We need to rebuild the headers if
	 * - we are labeling packets (could be different for different
	 *   destinations)
	 * - we have a source route (or routing header) since we need to
	 *   massage that to get the pseudo-header checksum
	 * - a socket option with COA_HEADER_CHANGED has been set which
	 *   set conn_v6lastdst to zero.
	 *
	 * Otherwise the prepend function will just update the src, dst,
	 * and flow label.
	 */
	if (is_system_labeled()) {
		/* TX MLP requires SCM_UCRED and don't have that here */
		if (connp->conn_mlp_type != mlptSingle) {
			mutex_exit(&connp->conn_lock);
			error = ECONNREFUSED;
			goto ud_error;
		}
		/*
		 * Check whether Trusted Solaris policy allows communication
		 * with this host, and pretend that the destination is
		 * unreachable if not.
		 * Compute any needed label and place it in ipp_label_v4/v6.
		 *
		 * Later conn_build_hdr_template/conn_prepend_hdr takes
		 * ipp_label_v4/v6 to form the packet.
		 *
		 * Tsol note: Since we hold conn_lock we know no other
		 * thread manipulates conn_xmit_ipp.
		 */
		error = conn_update_label(connp, ixa, &v6dst,
		    &connp->conn_xmit_ipp);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
		/* Rebuild the header template */
		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
		    flowinfo);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
	} else if (connp->conn_xmit_ipp.ipp_fields &
	    (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
	    IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
		/* Rebuild the header template */
		error = icmp_build_hdr_template(connp, &v6src, &v6dst,
		    flowinfo);
		if (error != 0) {
			mutex_exit(&connp->conn_lock);
			goto ud_error;
		}
	} else {
		/* Simply update the destination address if no source route */
		if (ixa->ixa_flags & IXAF_IS_IPV4) {
			ipha_t	*ipha = (ipha_t *)connp->conn_ht_iphc;

			IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
			/* Keep the template's DF bit in step with the ixa. */
			if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
				ipha->ipha_fragment_offset_and_flags |=
				    IPH_DF_HTONS;
			} else {
				ipha->ipha_fragment_offset_and_flags &=
				    ~IPH_DF_HTONS;
			}
		} else {
			ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
			ip6h->ip6_dst = v6dst;
		}
	}

	/*
	 * Remember the dst etc which corresponds to the built header
	 * template and conn_ixa.
	 */
	oldixa = conn_replace_ixa(connp, ixa);
	connp->conn_v6lastdst = v6dst;
	connp->conn_lastflowinfo = flowinfo;
	connp->conn_lastscopeid = ixa->ixa_scopeid;
	connp->conn_lastsrcid = srcid;
	/* Also remember a source to use together with lastdst */
	connp->conn_v6lastsrc = v6src;

	data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
	    flowinfo, &error);

	/* Done with conn_t */
	mutex_exit(&connp->conn_lock);
	/* The previous conn_ixa is released only after conn_lock is dropped. */
	ixa_refrele(oldixa);

	if (data_mp == NULL) {
		ASSERT(error != 0);
		goto ud_error;
	}

	if (!do_ipsec) {
		/* Policy might differ for different ICMP type/code */
		data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
		if (data_mp == NULL) {
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			error = EHOSTUNREACH;	/* IPsec policy failure */
			goto done;
		}
	}

	/* We're done.  Pass the packet to ip. */
	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);

	error = conn_ip_output(data_mp, ixa);
	/* No rawipOutErrors if an error since IP increases its error counter */
	switch (error) {
	case 0:
		break;
	case EWOULDBLOCK:
		/* Flow controlled: reported to the caller as success. */
		(void) ixa_check_drain_insert(connp, ixa);
		error = 0;
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		/* FALLTHRU */
	default:
		mutex_enter(&connp->conn_lock);
		/*
		 * Clear the source and v6lastdst so we call ip_attr_connect
		 * for the next packet and try to pick a better source.
		 */
		if (connp->conn_mcbc_bind)
			connp->conn_saddr_v6 = ipv6_all_zeros;
		else
			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
		connp->conn_v6lastdst = ipv6_all_zeros;
		mutex_exit(&connp->conn_lock);
		break;
	}
done:
	/* Normal exit: data_mp has already been handed off or is NULL. */
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);
	return (error);

ud_error:
	/* Error exit: conn_lock must already have been dropped by the caller of goto. */
	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);

	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
	freemsg(data_mp);
	return (error);
}
/*
 * Discard mp. On DEBUG kernels a console message notes that a message
 * arrived during fallback. Always returns 0.
 */
/* ARGSUSED */
static int
icmp_wput_fallback(queue_t *q, mblk_t *mp)
{
#ifdef DEBUG
	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
#endif
	freemsg(mp);
	return (0);
}

/*
 * Dispatch a write-side message on its STREAMS message type.
 *
 * M_PROTO/M_PCPROTO messages are dispatched on their TPI primitive type;
 * TI_GETPEERNAME/TI_GETMYNAME M_IOCTLs start a copyin of the user's strbuf;
 * M_IOCDATA is handed to icmp_wput_iocdata(). Every case that handles the
 * message returns directly; anything that merely breaks out of the switch
 * is passed on to ip_wput_nondata().
 */
static void
icmp_wput_other(queue_t *q, mblk_t *mp)
{
	uchar_t	*rptr = mp->b_rptr;
	struct iocblk *iocp;
	conn_t	*connp = Q_TO_CONN(q);
	icmp_t	*icmp = connp->conn_icmp;
	cred_t	*cr;

	switch (mp->b_datap->db_type) {
	case M_PROTO:
	case M_PCPROTO:
		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
			/*
			 * If the message does not contain a PRIM_type,
			 * throw it away.
			 */
			freemsg(mp);
			return;
		}
		switch (((t_primp_t)rptr)->type) {
		case T_ADDR_REQ:
			icmp_addr_req(q, mp);
			return;
		case O_T_BIND_REQ:
		case T_BIND_REQ:
			icmp_tpi_bind(q, mp);
			return;
		case T_CONN_REQ:
			icmp_tpi_connect(q, mp);
			return;
		case T_CAPABILITY_REQ:
			icmp_capability_req(q, mp);
			return;
		case T_INFO_REQ:
			icmp_info_req(q, mp);
			return;
		case T_UNITDATA_REQ:
			/*
			 * If a T_UNITDATA_REQ gets here, the address must
			 * be bad.  Valid T_UNITDATA_REQs are handled
			 * in icmp_wput.
			 */
			icmp_ud_err(q, mp, EADDRNOTAVAIL);
			return;
		case T_UNBIND_REQ:
			icmp_tpi_unbind(q, mp);
			return;
		case T_SVR4_OPTMGMT_REQ:
			/*
			 * All Solaris components should pass a db_credp
			 * for this TPI message, hence we ASSERT.
			 * But in case there is some other M_PROTO that looks
			 * like a TPI message sent by some other kernel
			 * component, we check and return an error.
			 */
			cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			if (cr == NULL) {
				icmp_err_ack(q, mp, TSYSERR, EINVAL);
				return;
			}

			/*
			 * snmpcom_req() gets first look at the request;
			 * only when it returns false is it treated as an
			 * ordinary option management request.
			 */
			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
			    cr)) {
				svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
			}
			return;

		case T_OPTMGMT_REQ:
			/*
			 * All Solaris components should pass a db_credp
			 * for this TPI message, hence we ASSERT.
			 * But in case there is some other M_PROTO that looks
			 * like a TPI message sent by some other kernel
			 * component, we check and return an error.
			 */
			cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			if (cr == NULL) {
				icmp_err_ack(q, mp, TSYSERR, EINVAL);
				return;
			}
			tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
			return;

		case T_DISCON_REQ:
			icmp_tpi_disconnect(q, mp);
			return;

		/* The following TPI message is not supported by icmp. */
		case O_T_CONN_RES:
		case T_CONN_RES:
			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
			return;

		/* The following 3 TPI requests are illegal for icmp. */
		case T_DATA_REQ:
		case T_EXDATA_REQ:
		case T_ORDREL_REQ:
			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
			return;
		default:
			/* Unknown primitive: falls out to ip_wput_nondata(). */
			break;
		}
		break;
	case M_FLUSH:
		/* Flush our write queue, then still pass the M_FLUSH on. */
		if (*rptr & FLUSHW)
			flushq(q, FLUSHDATA);
		break;
	case M_IOCTL:
		iocp = (struct iocblk *)mp->b_rptr;
		switch (iocp->ioc_cmd) {
		case TI_GETPEERNAME:
			if (icmp->icmp_state != TS_DATA_XFER) {
				/*
				 * If a default destination address has not
				 * been associated with the stream, then we
				 * don't know the peer's name.
				 */
				iocp->ioc_error = ENOTCONN;
				iocp->ioc_count = 0;
				mp->b_datap->db_type = M_IOCACK;
				qreply(q, mp);
				return;
			}
			/* FALLTHRU */
		case TI_GETMYNAME:
			/*
			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
			 * need to copyin the user's strbuf structure.
			 * Processing will continue in the M_IOCDATA case
			 * below.
			 */
			mi_copyin(q, mp, NULL,
			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
			return;
		default:
			break;
		}
		break;
	case M_IOCDATA:
		icmp_wput_iocdata(q, mp);
		return;
	default:
		/* Unrecognized messages are passed through without change. */
		break;
	}
	ip_wput_nondata(q, mp);
}
*/ 4880 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4881 case TI_GETMYNAME: 4882 case TI_GETPEERNAME: 4883 break; 4884 default: 4885 ip_wput_nondata(q, mp); 4886 return; 4887 } 4888 4889 switch (mi_copy_state(q, mp, &mp1)) { 4890 case -1: 4891 return; 4892 case MI_COPY_CASE(MI_COPY_IN, 1): 4893 break; 4894 case MI_COPY_CASE(MI_COPY_OUT, 1): 4895 /* 4896 * The address has been copied out, so now 4897 * copyout the strbuf. 4898 */ 4899 mi_copyout(q, mp); 4900 return; 4901 case MI_COPY_CASE(MI_COPY_OUT, 2): 4902 /* 4903 * The address and strbuf have been copied out. 4904 * We're done, so just acknowledge the original 4905 * M_IOCTL. 4906 */ 4907 mi_copy_done(q, mp, 0); 4908 return; 4909 default: 4910 /* 4911 * Something strange has happened, so acknowledge 4912 * the original M_IOCTL with an EPROTO error. 4913 */ 4914 mi_copy_done(q, mp, EPROTO); 4915 return; 4916 } 4917 4918 /* 4919 * Now we have the strbuf structure for TI_GETMYNAME 4920 * and TI_GETPEERNAME. Next we copyout the requested 4921 * address and then we'll copyout the strbuf. 
4922 */ 4923 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4924 (void *)mp1->b_rptr); 4925 4926 if (connp->conn_family == AF_INET) 4927 addrlen = sizeof (sin_t); 4928 else 4929 addrlen = sizeof (sin6_t); 4930 4931 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4932 mi_copy_done(q, mp, EINVAL); 4933 return; 4934 } 4935 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4936 case TI_GETMYNAME: 4937 break; 4938 case TI_GETPEERNAME: 4939 if (icmp->icmp_state != TS_DATA_XFER) { 4940 mi_copy_done(q, mp, ENOTCONN); 4941 return; 4942 } 4943 break; 4944 default: 4945 mi_copy_done(q, mp, EPROTO); 4946 return; 4947 } 4948 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4949 if (!mp1) 4950 return; 4951 4952 STRUCT_FSET(sb, len, addrlen); 4953 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4954 case TI_GETMYNAME: 4955 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4956 &addrlen); 4957 break; 4958 case TI_GETPEERNAME: 4959 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4960 &addrlen); 4961 break; 4962 } 4963 mp1->b_wptr += addrlen; 4964 /* Copy out the address */ 4965 mi_copyout(q, mp); 4966 } 4967 4968 void 4969 icmp_ddi_g_init(void) 4970 { 4971 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4972 icmp_opt_obj.odb_opt_arr_cnt); 4973 4974 /* 4975 * We want to be informed each time a stack is created or 4976 * destroyed in the kernel, so we can maintain the 4977 * set of icmp_stack_t's. 4978 */ 4979 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4980 } 4981 4982 void 4983 icmp_ddi_g_destroy(void) 4984 { 4985 netstack_unregister(NS_ICMP); 4986 } 4987 4988 #define INET_NAME "ip" 4989 4990 /* 4991 * Initialize the ICMP stack instance. 
4992 */ 4993 static void * 4994 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 4995 { 4996 icmp_stack_t *is; 4997 int error = 0; 4998 size_t arrsz; 4999 major_t major; 5000 5001 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 5002 is->is_netstack = ns; 5003 5004 arrsz = sizeof (icmp_propinfo_tbl); 5005 is->is_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP); 5006 bcopy(icmp_propinfo_tbl, is->is_propinfo_tbl, arrsz); 5007 5008 is->is_ksp = rawip_kstat_init(stackid); 5009 5010 major = mod_name_to_major(INET_NAME); 5011 error = ldi_ident_from_major(major, &is->is_ldi_ident); 5012 ASSERT(error == 0); 5013 return (is); 5014 } 5015 5016 /* 5017 * Free the ICMP stack instance. 5018 */ 5019 static void 5020 rawip_stack_fini(netstackid_t stackid, void *arg) 5021 { 5022 icmp_stack_t *is = (icmp_stack_t *)arg; 5023 5024 kmem_free(is->is_propinfo_tbl, sizeof (icmp_propinfo_tbl)); 5025 is->is_propinfo_tbl = NULL; 5026 5027 rawip_kstat_fini(stackid, is->is_ksp); 5028 is->is_ksp = NULL; 5029 ldi_ident_release(is->is_ldi_ident); 5030 kmem_free(is, sizeof (*is)); 5031 } 5032 5033 static void * 5034 rawip_kstat_init(netstackid_t stackid) 5035 { 5036 kstat_t *ksp; 5037 5038 rawip_named_kstat_t template = { 5039 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5040 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5041 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5042 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5043 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5044 }; 5045 5046 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5047 KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid); 5048 if (ksp == NULL || ksp->ks_data == NULL) 5049 return (NULL); 5050 5051 bcopy(&template, ksp->ks_data, sizeof (template)); 5052 ksp->ks_update = rawip_kstat_update; 5053 ksp->ks_private = (void *)(uintptr_t)stackid; 5054 5055 kstat_install(ksp); 5056 return (ksp); 5057 } 5058 5059 static void 5060 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5061 { 5062 if (ksp != NULL) { 
5063 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5064 kstat_delete_netstack(ksp, stackid); 5065 } 5066 } 5067 5068 static int 5069 rawip_kstat_update(kstat_t *ksp, int rw) 5070 { 5071 rawip_named_kstat_t *rawipkp; 5072 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5073 netstack_t *ns; 5074 icmp_stack_t *is; 5075 5076 if (ksp->ks_data == NULL) 5077 return (EIO); 5078 5079 if (rw == KSTAT_WRITE) 5080 return (EACCES); 5081 5082 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5083 5084 ns = netstack_find_by_stackid(stackid); 5085 if (ns == NULL) 5086 return (-1); 5087 is = ns->netstack_icmp; 5088 if (is == NULL) { 5089 netstack_rele(ns); 5090 return (-1); 5091 } 5092 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5093 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5094 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5095 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5096 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5097 netstack_rele(ns); 5098 return (0); 5099 } 5100 5101 /* ARGSUSED */ 5102 int 5103 rawip_accept(sock_lower_handle_t lproto_handle, 5104 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5105 cred_t *cr) 5106 { 5107 return (EOPNOTSUPP); 5108 } 5109 5110 /* ARGSUSED */ 5111 int 5112 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5113 socklen_t len, cred_t *cr) 5114 { 5115 conn_t *connp = (conn_t *)proto_handle; 5116 int error; 5117 5118 /* All Solaris components should pass a cred for this operation. 
*/ 5119 ASSERT(cr != NULL); 5120 5121 /* Binding to a NULL address really means unbind */ 5122 if (sa == NULL) 5123 error = rawip_do_unbind(connp); 5124 else 5125 error = rawip_do_bind(connp, sa, len); 5126 5127 if (error < 0) { 5128 if (error == -TOUTSTATE) 5129 error = EINVAL; 5130 else 5131 error = proto_tlitosyserr(-error); 5132 } 5133 return (error); 5134 } 5135 5136 static int 5137 rawip_implicit_bind(conn_t *connp) 5138 { 5139 sin6_t sin6addr; 5140 sin_t *sin; 5141 sin6_t *sin6; 5142 socklen_t len; 5143 int error; 5144 5145 if (connp->conn_family == AF_INET) { 5146 len = sizeof (struct sockaddr_in); 5147 sin = (sin_t *)&sin6addr; 5148 *sin = sin_null; 5149 sin->sin_family = AF_INET; 5150 sin->sin_addr.s_addr = INADDR_ANY; 5151 } else { 5152 ASSERT(connp->conn_family == AF_INET6); 5153 len = sizeof (sin6_t); 5154 sin6 = (sin6_t *)&sin6addr; 5155 *sin6 = sin6_null; 5156 sin6->sin6_family = AF_INET6; 5157 V6_SET_ZERO(sin6->sin6_addr); 5158 } 5159 5160 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5161 5162 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5163 } 5164 5165 static int 5166 rawip_unbind(conn_t *connp) 5167 { 5168 int error; 5169 5170 error = rawip_do_unbind(connp); 5171 if (error < 0) { 5172 error = proto_tlitosyserr(-error); 5173 } 5174 return (error); 5175 } 5176 5177 /* ARGSUSED */ 5178 int 5179 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5180 { 5181 return (EOPNOTSUPP); 5182 } 5183 5184 int 5185 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5186 socklen_t len, sock_connid_t *id, cred_t *cr) 5187 { 5188 conn_t *connp = (conn_t *)proto_handle; 5189 icmp_t *icmp = connp->conn_icmp; 5190 int error; 5191 boolean_t did_bind = B_FALSE; 5192 pid_t pid = curproc->p_pid; 5193 5194 /* All Solaris components should pass a cred for this operation. 
/*
 * Convert a non-STREAMS raw socket into a STREAMS endpoint on queue pair q.
 *
 * The conn is attached to the queue pair, the stream head is told the
 * write offset and receive high-water mark, the helper stream is freed,
 * and the current capability/address/option state is handed to
 * quiesced_cb. Any message returned by the callback, followed by whatever
 * was queued on icmp_fallback_queue_head, is then sent upstream before
 * IPCL_NONSTR is cleared. Always returns 0.
 */
/* ARGSUSED2 */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
    sock_quiesce_arg_t *arg)
{
	conn_t  *connp = (conn_t *)proto_handle;
	icmp_t	*icmp;
	struct T_capability_ack tca;
	struct sockaddr_in6 laddr, faddr;
	socklen_t laddrlen, faddrlen;
	short opts;
	struct stroptions *stropt;
	mblk_t *mp, *stropt_mp;
	int error;

	icmp = connp->conn_icmp;

	/* Allocated up front, before any conn state is modified. */
	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);

	/*
	 * setup the fallback stream that was allocated
	 */
	/* The queue's q_ptr values are saved before being overwritten below. */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	WR(q)->q_qinfo = &icmpwinit;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	/* Notify stream head about options before sending up data */
	stropt_mp->b_datap->db_type = M_SETOPTS;
	stropt_mp->b_wptr += sizeof (*stropt);
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt->so_flags = SO_WROFF | SO_HIWAT;
	stropt->so_wroff = connp->conn_wroff;
	stropt->so_hiwat = connp->conn_rcvbuf;
	putnext(RD(q), stropt_mp);

	/*
	 * free helper stream
	 */
	ip_free_helper_stream(connp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	icmp_do_capability_ack(icmp, &tca, TC1_INFO);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) rawip_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = rawip_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	/* No peer (e.g. not connected): report a zero-length peer address. */
	if (error != 0)
		faddrlen = 0;
	opts = 0;
	if (connp->conn_dgram_errind)
		opts |= SO_DGRAM_ERRIND;
	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
		opts |= SO_DONTROUTE;

	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/*
	 * Attempts to send data up during fallback will result in it being
	 * queued in icmp_t. Now we push up any queued packets.
	 */
	mutex_enter(&icmp->icmp_recv_lock);
	/* The callback's message goes to the front of the fallback queue. */
	if (mp != NULL) {
		mp->b_next = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp;
	}
	while (icmp->icmp_fallback_queue_head != NULL) {
		mp = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp->b_next;
		mp->b_next = NULL;
		/* icmp_recv_lock is not held across putnext(). */
		mutex_exit(&icmp->icmp_recv_lock);
		putnext(RD(q), mp);
		mutex_enter(&icmp->icmp_recv_lock);
	}
	/* The head is NULL here, so this empties the tail pointer as well. */
	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;

	/*
	 * No longer a streams less socket
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_flags &= ~IPCL_NONSTR;
	mutex_exit(&connp->conn_lock);

	mutex_exit(&icmp->icmp_recv_lock);

	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
	    icmp->icmp_fallback_queue_tail == NULL);

	ASSERT(connp->conn_ref >= 1);

	return (0);
}
5321 */ 5322 mutex_enter(&icmp->icmp_recv_lock); 5323 if (mp != NULL) { 5324 mp->b_next = icmp->icmp_fallback_queue_head; 5325 icmp->icmp_fallback_queue_head = mp; 5326 } 5327 while (icmp->icmp_fallback_queue_head != NULL) { 5328 mp = icmp->icmp_fallback_queue_head; 5329 icmp->icmp_fallback_queue_head = mp->b_next; 5330 mp->b_next = NULL; 5331 mutex_exit(&icmp->icmp_recv_lock); 5332 putnext(RD(q), mp); 5333 mutex_enter(&icmp->icmp_recv_lock); 5334 } 5335 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5336 5337 /* 5338 * No longer a streams less socket 5339 */ 5340 mutex_enter(&connp->conn_lock); 5341 connp->conn_flags &= ~IPCL_NONSTR; 5342 mutex_exit(&connp->conn_lock); 5343 5344 mutex_exit(&icmp->icmp_recv_lock); 5345 5346 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5347 icmp->icmp_fallback_queue_tail == NULL); 5348 5349 ASSERT(connp->conn_ref >= 1); 5350 5351 return (0); 5352 } 5353 5354 /* ARGSUSED2 */ 5355 sock_lower_handle_t 5356 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5357 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5358 { 5359 conn_t *connp; 5360 5361 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5362 *errorp = EPROTONOSUPPORT; 5363 return (NULL); 5364 } 5365 5366 connp = rawip_do_open(family, credp, errorp, flags); 5367 if (connp != NULL) { 5368 connp->conn_flags |= IPCL_NONSTR; 5369 5370 mutex_enter(&connp->conn_lock); 5371 connp->conn_state_flags &= ~CONN_INCIPIENT; 5372 mutex_exit(&connp->conn_lock); 5373 *sock_downcalls = &sock_rawip_downcalls; 5374 *smodep = SM_ATOMIC; 5375 } else { 5376 ASSERT(*errorp != 0); 5377 } 5378 5379 return ((sock_lower_handle_t)connp); 5380 } 5381 5382 /* ARGSUSED3 */ 5383 void 5384 rawip_activate(sock_lower_handle_t proto_handle, 5385 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5386 cred_t *cr) 5387 { 5388 conn_t *connp = (conn_t *)proto_handle; 5389 struct sock_proto_props sopp; 5390 5391 /* All 
Solaris components should pass a cred for this operation. */ 5392 ASSERT(cr != NULL); 5393 5394 connp->conn_upcalls = sock_upcalls; 5395 connp->conn_upper_handle = sock_handle; 5396 5397 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5398 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5399 sopp.sopp_wroff = connp->conn_wroff; 5400 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5401 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5402 sopp.sopp_maxblk = INFPSZ; 5403 sopp.sopp_maxpsz = IP_MAXPACKET; 5404 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5405 icmp_mod_info.mi_minpsz; 5406 5407 (*connp->conn_upcalls->su_set_proto_props) 5408 (connp->conn_upper_handle, &sopp); 5409 5410 icmp_bind_proto(connp->conn_icmp); 5411 } 5412 5413 /* ARGSUSED3 */ 5414 int 5415 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5416 socklen_t *salenp, cred_t *cr) 5417 { 5418 conn_t *connp = (conn_t *)proto_handle; 5419 icmp_t *icmp = connp->conn_icmp; 5420 int error; 5421 5422 /* All Solaris components should pass a cred for this operation. */ 5423 ASSERT(cr != NULL); 5424 5425 mutex_enter(&connp->conn_lock); 5426 if (icmp->icmp_state != TS_DATA_XFER) 5427 error = ENOTCONN; 5428 else 5429 error = conn_getpeername(connp, sa, salenp); 5430 mutex_exit(&connp->conn_lock); 5431 return (error); 5432 } 5433 5434 /* ARGSUSED3 */ 5435 int 5436 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5437 socklen_t *salenp, cred_t *cr) 5438 { 5439 conn_t *connp = (conn_t *)proto_handle; 5440 int error; 5441 5442 /* All Solaris components should pass a cred for this operation. 
*/ 5443 ASSERT(cr != NULL); 5444 5445 mutex_enter(&connp->conn_lock); 5446 error = conn_getsockname(connp, sa, salenp); 5447 mutex_exit(&connp->conn_lock); 5448 return (error); 5449 } 5450 5451 int 5452 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5453 const void *optvalp, socklen_t optlen, cred_t *cr) 5454 { 5455 conn_t *connp = (conn_t *)proto_handle; 5456 int error; 5457 5458 /* All Solaris components should pass a cred for this operation. */ 5459 ASSERT(cr != NULL); 5460 5461 error = proto_opt_check(level, option_name, optlen, NULL, 5462 icmp_opt_obj.odb_opt_des_arr, 5463 icmp_opt_obj.odb_opt_arr_cnt, 5464 B_TRUE, B_FALSE, cr); 5465 5466 if (error != 0) { 5467 /* 5468 * option not recognized 5469 */ 5470 if (error < 0) { 5471 error = proto_tlitosyserr(-error); 5472 } 5473 return (error); 5474 } 5475 5476 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5477 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5478 (uchar_t *)optvalp, NULL, cr); 5479 5480 ASSERT(error >= 0); 5481 5482 return (error); 5483 } 5484 5485 int 5486 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5487 void *optvalp, socklen_t *optlen, cred_t *cr) 5488 { 5489 int error; 5490 conn_t *connp = (conn_t *)proto_handle; 5491 t_uscalar_t max_optbuf_len; 5492 void *optvalp_buf; 5493 int len; 5494 5495 /* All Solaris components should pass a cred for this operation. 
*/ 5496 ASSERT(cr != NULL); 5497 5498 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5499 icmp_opt_obj.odb_opt_des_arr, 5500 icmp_opt_obj.odb_opt_arr_cnt, 5501 B_FALSE, B_TRUE, cr); 5502 5503 if (error != 0) { 5504 if (error < 0) { 5505 error = proto_tlitosyserr(-error); 5506 } 5507 return (error); 5508 } 5509 5510 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5511 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5512 if (len == -1) { 5513 kmem_free(optvalp_buf, max_optbuf_len); 5514 return (EINVAL); 5515 } 5516 5517 /* 5518 * update optlen and copy option value 5519 */ 5520 t_uscalar_t size = MIN(len, *optlen); 5521 5522 bcopy(optvalp_buf, optvalp, size); 5523 bcopy(&size, optlen, sizeof (size)); 5524 5525 kmem_free(optvalp_buf, max_optbuf_len); 5526 return (0); 5527 } 5528 5529 /* ARGSUSED1 */ 5530 int 5531 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5532 { 5533 conn_t *connp = (conn_t *)proto_handle; 5534 5535 /* All Solaris components should pass a cred for this operation. */ 5536 ASSERT(cr != NULL); 5537 5538 (void) rawip_do_close(connp); 5539 return (0); 5540 } 5541 5542 /* ARGSUSED2 */ 5543 int 5544 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5545 { 5546 conn_t *connp = (conn_t *)proto_handle; 5547 5548 /* All Solaris components should pass a cred for this operation. 
*/ 5549 ASSERT(cr != NULL); 5550 5551 /* shut down the send side */ 5552 if (how != SHUT_RD) 5553 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5554 SOCK_OPCTL_SHUT_SEND, 0); 5555 /* shut down the recv side */ 5556 if (how != SHUT_WR) 5557 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5558 SOCK_OPCTL_SHUT_RECV, 0); 5559 return (0); 5560 } 5561 5562 void 5563 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5564 { 5565 conn_t *connp = (conn_t *)proto_handle; 5566 icmp_t *icmp = connp->conn_icmp; 5567 5568 mutex_enter(&icmp->icmp_recv_lock); 5569 connp->conn_flow_cntrld = B_FALSE; 5570 mutex_exit(&icmp->icmp_recv_lock); 5571 } 5572 5573 int 5574 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5575 int mode, int32_t *rvalp, cred_t *cr) 5576 { 5577 conn_t *connp = (conn_t *)proto_handle; 5578 int error; 5579 5580 /* All Solaris components should pass a cred for this operation. */ 5581 ASSERT(cr != NULL); 5582 5583 /* 5584 * If we don't have a helper stream then create one. 5585 * ip_create_helper_stream takes care of locking the conn_t, 5586 * so this check for NULL is just a performance optimization. 5587 */ 5588 if (connp->conn_helper_info == NULL) { 5589 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5590 5591 ASSERT(is->is_ldi_ident != NULL); 5592 5593 /* 5594 * Create a helper stream for non-STREAMS socket. 
5595 */ 5596 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5597 if (error != 0) { 5598 ip0dbg(("rawip_ioctl: create of IP helper stream " 5599 "failed %d\n", error)); 5600 return (error); 5601 } 5602 } 5603 5604 switch (cmd) { 5605 case _SIOCSOCKFALLBACK: 5606 case TI_GETPEERNAME: 5607 case TI_GETMYNAME: 5608 #ifdef DEBUG 5609 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5610 " socket", cmd); 5611 #endif 5612 error = EINVAL; 5613 break; 5614 default: 5615 /* 5616 * Pass on to IP using helper stream 5617 */ 5618 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5619 cmd, arg, mode, cr, rvalp); 5620 break; 5621 } 5622 return (error); 5623 } 5624 5625 int 5626 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5627 cred_t *cr) 5628 { 5629 sin6_t *sin6; 5630 sin_t *sin = NULL; 5631 uint_t srcid; 5632 conn_t *connp = (conn_t *)proto_handle; 5633 icmp_t *icmp = connp->conn_icmp; 5634 int error = 0; 5635 icmp_stack_t *is = icmp->icmp_is; 5636 pid_t pid = curproc->p_pid; 5637 ip_xmit_attr_t *ixa; 5638 5639 ASSERT(DB_TYPE(mp) == M_DATA); 5640 5641 /* All Solaris components should pass a cred for this operation. */ 5642 ASSERT(cr != NULL); 5643 5644 /* do an implicit bind if necessary */ 5645 if (icmp->icmp_state == TS_UNBND) { 5646 error = rawip_implicit_bind(connp); 5647 /* 5648 * We could be racing with an actual bind, in which case 5649 * we would see EPROTO. We cross our fingers and try 5650 * to connect. 
5651 */ 5652 if (!(error == 0 || error == EPROTO)) { 5653 freemsg(mp); 5654 return (error); 5655 } 5656 } 5657 5658 /* Protocol 255 contains full IP headers */ 5659 /* Read without holding lock */ 5660 if (icmp->icmp_hdrincl) { 5661 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5662 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5663 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5664 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5665 freemsg(mp); 5666 return (EINVAL); 5667 } 5668 } 5669 error = icmp_output_hdrincl(connp, mp, cr, pid); 5670 if (is->is_sendto_ignerr) 5671 return (0); 5672 else 5673 return (error); 5674 } 5675 5676 /* Connected? */ 5677 if (msg->msg_name == NULL) { 5678 if (icmp->icmp_state != TS_DATA_XFER) { 5679 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5680 return (EDESTADDRREQ); 5681 } 5682 if (msg->msg_controllen != 0) { 5683 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5684 NULL, msg, cr, pid); 5685 } else { 5686 error = icmp_output_connected(connp, mp, cr, pid); 5687 } 5688 if (is->is_sendto_ignerr) 5689 return (0); 5690 else 5691 return (error); 5692 } 5693 if (icmp->icmp_state == TS_DATA_XFER) { 5694 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5695 return (EISCONN); 5696 } 5697 error = proto_verify_ip_addr(connp->conn_family, 5698 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5699 if (error != 0) { 5700 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5701 return (error); 5702 } 5703 switch (connp->conn_family) { 5704 case AF_INET6: 5705 sin6 = (sin6_t *)msg->msg_name; 5706 5707 /* No support for mapped addresses on raw sockets */ 5708 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5709 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5710 return (EADDRNOTAVAIL); 5711 } 5712 srcid = sin6->__sin6_src_id; 5713 5714 /* 5715 * If the local address is a mapped address return 5716 * an error. 
5717 * It would be possible to send an IPv6 packet but the 5718 * response would never make it back to the application 5719 * since it is bound to a mapped address. 5720 */ 5721 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5722 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5723 return (EADDRNOTAVAIL); 5724 } 5725 5726 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5727 sin6->sin6_addr = ipv6_loopback; 5728 5729 /* 5730 * We have to allocate an ip_xmit_attr_t before we grab 5731 * conn_lock and we need to hold conn_lock once we've check 5732 * conn_same_as_last_v6 to handle concurrent send* calls on a 5733 * socket. 5734 */ 5735 if (msg->msg_controllen == 0) { 5736 ixa = conn_get_ixa(connp, B_FALSE); 5737 if (ixa == NULL) { 5738 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5739 return (ENOMEM); 5740 } 5741 } else { 5742 ixa = NULL; 5743 } 5744 mutex_enter(&connp->conn_lock); 5745 if (icmp->icmp_delayed_error != 0) { 5746 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5747 5748 error = icmp->icmp_delayed_error; 5749 icmp->icmp_delayed_error = 0; 5750 5751 /* Compare IP address and family */ 5752 5753 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5754 &sin2->sin6_addr) && 5755 sin6->sin6_family == sin2->sin6_family) { 5756 mutex_exit(&connp->conn_lock); 5757 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5758 if (ixa != NULL) 5759 ixa_refrele(ixa); 5760 return (error); 5761 } 5762 } 5763 if (msg->msg_controllen != 0) { 5764 mutex_exit(&connp->conn_lock); 5765 ASSERT(ixa == NULL); 5766 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5767 NULL, msg, cr, pid); 5768 } else if (conn_same_as_last_v6(connp, sin6) && 5769 connp->conn_lastsrcid == srcid && 5770 ipsec_outbound_policy_current(ixa)) { 5771 /* icmp_output_lastdst drops conn_lock */ 5772 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5773 } else { 5774 /* icmp_output_newdst drops conn_lock */ 5775 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5776 pid, ixa); 5777 } 5778 
ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5779 if (is->is_sendto_ignerr) 5780 return (0); 5781 else 5782 return (error); 5783 case AF_INET: 5784 sin = (sin_t *)msg->msg_name; 5785 5786 if (sin->sin_addr.s_addr == INADDR_ANY) 5787 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5788 5789 /* 5790 * We have to allocate an ip_xmit_attr_t before we grab 5791 * conn_lock and we need to hold conn_lock once we've check 5792 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5793 */ 5794 if (msg->msg_controllen == 0) { 5795 ixa = conn_get_ixa(connp, B_FALSE); 5796 if (ixa == NULL) { 5797 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5798 return (ENOMEM); 5799 } 5800 } else { 5801 ixa = NULL; 5802 } 5803 mutex_enter(&connp->conn_lock); 5804 if (icmp->icmp_delayed_error != 0) { 5805 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5806 5807 error = icmp->icmp_delayed_error; 5808 icmp->icmp_delayed_error = 0; 5809 5810 /* Compare IP address */ 5811 5812 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5813 mutex_exit(&connp->conn_lock); 5814 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5815 if (ixa != NULL) 5816 ixa_refrele(ixa); 5817 return (error); 5818 } 5819 } 5820 5821 if (msg->msg_controllen != 0) { 5822 mutex_exit(&connp->conn_lock); 5823 ASSERT(ixa == NULL); 5824 error = icmp_output_ancillary(connp, sin, NULL, mp, 5825 NULL, msg, cr, pid); 5826 } else if (conn_same_as_last_v4(connp, sin) && 5827 ipsec_outbound_policy_current(ixa)) { 5828 /* icmp_output_lastdst drops conn_lock */ 5829 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5830 } else { 5831 /* icmp_output_newdst drops conn_lock */ 5832 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5833 pid, ixa); 5834 } 5835 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5836 if (is->is_sendto_ignerr) 5837 return (0); 5838 else 5839 return (error); 5840 default: 5841 return (EINVAL); 5842 } 5843 } 5844 5845 sock_downcalls_t sock_rawip_downcalls = { 5846 rawip_activate, 5847 rawip_accept, 5848 
rawip_bind, 5849 rawip_listen, 5850 rawip_connect, 5851 rawip_getpeername, 5852 rawip_getsockname, 5853 rawip_getsockopt, 5854 rawip_setsockopt, 5855 rawip_send, 5856 NULL, 5857 NULL, 5858 NULL, 5859 rawip_shutdown, 5860 rawip_clr_flowctrl, 5861 rawip_ioctl, 5862 rawip_close 5863 }; 5864