1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. 
*/ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/stropts.h> 30 #include <sys/strlog.h> 31 #include <sys/strsun.h> 32 #define _SUN_TPI_VERSION 2 33 #include <sys/tihdr.h> 34 #include <sys/timod.h> 35 #include <sys/ddi.h> 36 #include <sys/sunddi.h> 37 #include <sys/strsubr.h> 38 #include <sys/suntpi.h> 39 #include <sys/xti_inet.h> 40 #include <sys/cmn_err.h> 41 #include <sys/kmem.h> 42 #include <sys/cred_impl.h> 43 #include <sys/policy.h> 44 #include <sys/priv.h> 45 #include <sys/ucred.h> 46 #include <sys/zone.h> 47 48 #include <sys/sockio.h> 49 #include <sys/socket.h> 50 #include <sys/socketvar.h> 51 #include <sys/vtrace.h> 52 #include <sys/sdt.h> 53 #include <sys/debug.h> 54 #include <sys/isa_defs.h> 55 #include <sys/random.h> 56 #include <netinet/in.h> 57 #include <netinet/ip6.h> 58 #include <netinet/icmp6.h> 59 #include <netinet/udp.h> 60 61 #include <inet/common.h> 62 #include <inet/ip.h> 63 #include <inet/ip_impl.h> 64 #include <inet/ipsec_impl.h> 65 #include <inet/ip6.h> 66 #include <inet/ip_ire.h> 67 #include <inet/ip_if.h> 68 #include <inet/ip_multi.h> 69 #include <inet/ip_ndp.h> 70 #include <inet/proto_set.h> 71 #include <inet/mib2.h> 72 #include <inet/nd.h> 73 #include <inet/optcom.h> 74 #include <inet/snmpcom.h> 75 #include <inet/kstatcom.h> 76 #include <inet/ipclassifier.h> 77 78 #include <sys/tsol/label.h> 79 #include <sys/tsol/tnet.h> 80 81 #include <inet/rawip_impl.h> 82 83 #include <sys/disp.h> 84 85 /* 86 * Synchronization notes: 87 * 88 * RAWIP is MT and uses the usual kernel synchronization primitives. We use 89 * conn_lock to protect the icmp_t. 90 * 91 * Plumbing notes: 92 * ICMP is always a device driver. For compatibility with mibopen() code 93 * it is possible to I_PUSH "icmp", but that results in pushing a passthrough 94 * dummy module. 
95 */ 96 97 static void icmp_addr_req(queue_t *q, mblk_t *mp); 98 static void icmp_tpi_bind(queue_t *q, mblk_t *mp); 99 static void icmp_bind_proto(icmp_t *icmp); 100 static int icmp_build_hdr_template(conn_t *, const in6_addr_t *, 101 const in6_addr_t *, uint32_t); 102 static void icmp_capability_req(queue_t *q, mblk_t *mp); 103 static int icmp_close(queue_t *q, int flags); 104 static void icmp_close_free(conn_t *); 105 static void icmp_tpi_connect(queue_t *q, mblk_t *mp); 106 static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp); 107 static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, 108 int sys_error); 109 static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 110 t_scalar_t tlierr, int sys_error); 111 static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, 112 ip_recv_attr_t *); 113 static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, 114 ip_recv_attr_t *); 115 static void icmp_info_req(queue_t *q, mblk_t *mp); 116 static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); 117 static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags); 118 static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, 119 cred_t *credp); 120 static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, 121 cred_t *credp); 122 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); 123 int icmp_opt_set(conn_t *connp, uint_t optset_context, 124 int level, int name, uint_t inlen, 125 uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 126 void *thisdg_attrs, cred_t *cr); 127 int icmp_opt_get(conn_t *connp, int level, int name, 128 uchar_t *ptr); 129 static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, 130 sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa); 131 static int icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); 132 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt); 133 static int 
icmp_param_set(queue_t *q, mblk_t *mp, char *value, 134 caddr_t cp, cred_t *cr); 135 static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, 136 const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *); 137 static mblk_t *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *, 138 mblk_t *, const in6_addr_t *, uint32_t, int *); 139 static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 140 uchar_t *ptr, int len); 141 static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); 142 static void icmp_tpi_unbind(queue_t *q, mblk_t *mp); 143 static void icmp_wput(queue_t *q, mblk_t *mp); 144 static void icmp_wput_fallback(queue_t *q, mblk_t *mp); 145 static void icmp_wput_other(queue_t *q, mblk_t *mp); 146 static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); 147 static void icmp_wput_restricted(queue_t *q, mblk_t *mp); 148 static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t); 149 150 static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns); 151 static void rawip_stack_fini(netstackid_t stackid, void *arg); 152 153 static void *rawip_kstat_init(netstackid_t stackid); 154 static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp); 155 static int rawip_kstat_update(kstat_t *kp, int rw); 156 static void rawip_stack_shutdown(netstackid_t stackid, void *arg); 157 158 /* Common routines for TPI and socket module */ 159 static conn_t *rawip_do_open(int, cred_t *, int *, int); 160 static void rawip_do_close(conn_t *); 161 static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t); 162 static int rawip_do_unbind(conn_t *); 163 static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t, 164 cred_t *, pid_t); 165 166 int rawip_getsockname(sock_lower_handle_t, struct sockaddr *, 167 socklen_t *, cred_t *); 168 int rawip_getpeername(sock_lower_handle_t, struct sockaddr *, 169 socklen_t *, cred_t *); 170 171 static struct module_info icmp_mod_info = { 172 5707, "icmp", 1, INFPSZ, 512, 128 173 
}; 174 175 /* 176 * Entry points for ICMP as a device. 177 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices. 178 */ 179 static struct qinit icmprinitv4 = { 180 NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info 181 }; 182 183 static struct qinit icmprinitv6 = { 184 NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info 185 }; 186 187 static struct qinit icmpwinit = { 188 (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info 189 }; 190 191 /* ICMP entry point during fallback */ 192 static struct qinit icmp_fallback_sock_winit = { 193 (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info 194 }; 195 196 /* For AF_INET aka /dev/icmp */ 197 struct streamtab icmpinfov4 = { 198 &icmprinitv4, &icmpwinit 199 }; 200 201 /* For AF_INET6 aka /dev/icmp6 */ 202 struct streamtab icmpinfov6 = { 203 &icmprinitv6, &icmpwinit 204 }; 205 206 /* Default structure copied into T_INFO_ACK messages */ 207 static struct T_info_ack icmp_g_t_info_ack = { 208 T_INFO_ACK, 209 IP_MAXPACKET, /* TSDU_size. icmp allows maximum size messages. */ 210 T_INVALID, /* ETSDU_size. icmp does not support expedited data. */ 211 T_INVALID, /* CDATA_size. icmp does not support connect data. */ 212 T_INVALID, /* DDATA_size. icmp does not support disconnect data. */ 213 0, /* ADDR_size - filled in later. */ 214 0, /* OPT_size - not initialized here */ 215 IP_MAXPACKET, /* TIDU_size. icmp allows maximum size messages. */ 216 T_CLTS, /* SERV_type. icmp supports connection-less. */ 217 TS_UNBND, /* CURRENT_state. This is set from icmp_state. */ 218 (XPG4_1|SENDZERO) /* PROVIDER_flag */ 219 }; 220 221 /* 222 * Table of ND variables supported by icmp. These are loaded into is_nd 223 * when the stack instance is created. 224 * All of these are alterable, within the min/max values given, at run time. 
225 */ 226 static icmpparam_t icmp_param_arr[] = { 227 /* min max value name */ 228 { 0, 128, 32, "icmp_wroff_extra" }, 229 { 1, 255, 255, "icmp_ipv4_ttl" }, 230 { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "icmp_ipv6_hoplimit"}, 231 { 0, 1, 1, "icmp_bsd_compat" }, 232 { 4096, 65536, 8192, "icmp_xmit_hiwat"}, 233 { 0, 65536, 1024, "icmp_xmit_lowat"}, 234 { 4096, 65536, 8192, "icmp_recv_hiwat"}, 235 { 65536, 1024*1024*1024, 256*1024, "icmp_max_buf"}, 236 { 0, 1, 0, "icmp_pmtu_discovery" }, 237 { 0, 1, 0, "icmp_sendto_ignerr" }, 238 }; 239 #define is_wroff_extra is_param_arr[0].icmp_param_value 240 #define is_ipv4_ttl is_param_arr[1].icmp_param_value 241 #define is_ipv6_hoplimit is_param_arr[2].icmp_param_value 242 #define is_bsd_compat is_param_arr[3].icmp_param_value 243 #define is_xmit_hiwat is_param_arr[4].icmp_param_value 244 #define is_xmit_lowat is_param_arr[5].icmp_param_value 245 #define is_recv_hiwat is_param_arr[6].icmp_param_value 246 #define is_max_buf is_param_arr[7].icmp_param_value 247 #define is_pmtu_discovery is_param_arr[8].icmp_param_value 248 #define is_sendto_ignerr is_param_arr[9].icmp_param_value 249 250 typedef union T_primitives *t_primp_t; 251 252 /* 253 * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message 254 * passed to icmp_wput. 255 * It calls IP to verify the local IP address, and calls IP to insert 256 * the conn_t in the fanout table. 257 * If everything is ok it then sends the T_BIND_ACK back up. 258 */ 259 static void 260 icmp_tpi_bind(queue_t *q, mblk_t *mp) 261 { 262 int error; 263 struct sockaddr *sa; 264 struct T_bind_req *tbr; 265 socklen_t len; 266 sin_t *sin; 267 sin6_t *sin6; 268 icmp_t *icmp; 269 conn_t *connp = Q_TO_CONN(q); 270 mblk_t *mp1; 271 cred_t *cr; 272 273 /* 274 * All Solaris components should pass a db_credp 275 * for this TPI message, hence we ASSERT. 
276 * But in case there is some other M_PROTO that looks 277 * like a TPI message sent by some other kernel 278 * component, we check and return an error. 279 */ 280 cr = msg_getcred(mp, NULL); 281 ASSERT(cr != NULL); 282 if (cr == NULL) { 283 icmp_err_ack(q, mp, TSYSERR, EINVAL); 284 return; 285 } 286 287 icmp = connp->conn_icmp; 288 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 289 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 290 "icmp_bind: bad req, len %u", 291 (uint_t)(mp->b_wptr - mp->b_rptr)); 292 icmp_err_ack(q, mp, TPROTO, 0); 293 return; 294 } 295 296 if (icmp->icmp_state != TS_UNBND) { 297 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 298 "icmp_bind: bad state, %u", icmp->icmp_state); 299 icmp_err_ack(q, mp, TOUTSTATE, 0); 300 return; 301 } 302 303 /* 304 * Reallocate the message to make sure we have enough room for an 305 * address. 306 */ 307 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 308 if (mp1 == NULL) { 309 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 310 return; 311 } 312 mp = mp1; 313 314 /* Reset the message type in preparation for shipping it back. 
*/ 315 DB_TYPE(mp) = M_PCPROTO; 316 tbr = (struct T_bind_req *)mp->b_rptr; 317 len = tbr->ADDR_length; 318 switch (len) { 319 case 0: /* request for a generic port */ 320 tbr->ADDR_offset = sizeof (struct T_bind_req); 321 if (connp->conn_family == AF_INET) { 322 tbr->ADDR_length = sizeof (sin_t); 323 sin = (sin_t *)&tbr[1]; 324 *sin = sin_null; 325 sin->sin_family = AF_INET; 326 mp->b_wptr = (uchar_t *)&sin[1]; 327 sa = (struct sockaddr *)sin; 328 len = sizeof (sin_t); 329 } else { 330 ASSERT(connp->conn_family == AF_INET6); 331 tbr->ADDR_length = sizeof (sin6_t); 332 sin6 = (sin6_t *)&tbr[1]; 333 *sin6 = sin6_null; 334 sin6->sin6_family = AF_INET6; 335 mp->b_wptr = (uchar_t *)&sin6[1]; 336 sa = (struct sockaddr *)sin6; 337 len = sizeof (sin6_t); 338 } 339 break; 340 341 case sizeof (sin_t): /* Complete IPv4 address */ 342 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 343 sizeof (sin_t)); 344 break; 345 346 case sizeof (sin6_t): /* Complete IPv6 address */ 347 sa = (struct sockaddr *)mi_offset_param(mp, 348 tbr->ADDR_offset, sizeof (sin6_t)); 349 break; 350 351 default: 352 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 353 "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); 354 icmp_err_ack(q, mp, TBADADDR, 0); 355 return; 356 } 357 358 error = rawip_do_bind(connp, sa, len); 359 if (error != 0) { 360 if (error > 0) { 361 icmp_err_ack(q, mp, TSYSERR, error); 362 } else { 363 icmp_err_ack(q, mp, -error, 0); 364 } 365 } else { 366 tbr->PRIM_type = T_BIND_ACK; 367 qreply(q, mp); 368 } 369 } 370 371 static int 372 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) 373 { 374 sin_t *sin; 375 sin6_t *sin6; 376 icmp_t *icmp = connp->conn_icmp; 377 int error = 0; 378 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 379 in_port_t lport; /* Network byte order */ 380 ipaddr_t v4src; /* Set if AF_INET */ 381 in6_addr_t v6src; 382 uint_t scopeid = 0; 383 zoneid_t zoneid = IPCL_ZONEID(connp); 384 ip_stack_t *ipst = 
connp->conn_netstack->netstack_ip; 385 386 if (sa == NULL || !OK_32PTR((char *)sa)) { 387 return (EINVAL); 388 } 389 390 switch (len) { 391 case sizeof (sin_t): /* Complete IPv4 address */ 392 sin = (sin_t *)sa; 393 if (sin->sin_family != AF_INET || 394 connp->conn_family != AF_INET) { 395 /* TSYSERR, EAFNOSUPPORT */ 396 return (EAFNOSUPPORT); 397 } 398 v4src = sin->sin_addr.s_addr; 399 IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); 400 if (v4src != INADDR_ANY) { 401 laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, 402 B_TRUE); 403 } 404 lport = sin->sin_port; 405 break; 406 case sizeof (sin6_t): /* Complete IPv6 address */ 407 sin6 = (sin6_t *)sa; 408 if (sin6->sin6_family != AF_INET6 || 409 connp->conn_family != AF_INET6) { 410 /* TSYSERR, EAFNOSUPPORT */ 411 return (EAFNOSUPPORT); 412 } 413 /* No support for mapped addresses on raw sockets */ 414 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 415 /* TSYSERR, EADDRNOTAVAIL */ 416 return (EADDRNOTAVAIL); 417 } 418 v6src = sin6->sin6_addr; 419 if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 420 if (IN6_IS_ADDR_LINKSCOPE(&v6src)) 421 scopeid = sin6->sin6_scope_id; 422 laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, 423 B_TRUE, scopeid); 424 } 425 lport = sin6->sin6_port; 426 break; 427 428 default: 429 /* TBADADDR */ 430 return (EADDRNOTAVAIL); 431 } 432 433 /* Is the local address a valid unicast, multicast, or broadcast? */ 434 if (laddr_type == IPVL_BAD) 435 return (EADDRNOTAVAIL); 436 437 /* 438 * The state must be TS_UNBND. 439 */ 440 mutex_enter(&connp->conn_lock); 441 if (icmp->icmp_state != TS_UNBND) { 442 mutex_exit(&connp->conn_lock); 443 return (-TOUTSTATE); 444 } 445 446 /* 447 * Copy the source address into our icmp structure. This address 448 * may still be zero; if so, ip will fill in the correct address 449 * each time an outbound packet is passed to it. 
450 * If we are binding to a broadcast or multicast address then 451 * we just set the conn_bound_addr since we don't want to use 452 * that as the source address when sending. 453 */ 454 connp->conn_bound_addr_v6 = v6src; 455 connp->conn_laddr_v6 = v6src; 456 if (scopeid != 0) { 457 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 458 connp->conn_ixa->ixa_scopeid = scopeid; 459 connp->conn_incoming_ifindex = scopeid; 460 } else { 461 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 462 connp->conn_incoming_ifindex = connp->conn_bound_if; 463 } 464 465 switch (laddr_type) { 466 case IPVL_UNICAST_UP: 467 case IPVL_UNICAST_DOWN: 468 connp->conn_saddr_v6 = v6src; 469 connp->conn_mcbc_bind = B_FALSE; 470 break; 471 case IPVL_MCAST: 472 case IPVL_BCAST: 473 /* ip_set_destination will pick a source address later */ 474 connp->conn_saddr_v6 = ipv6_all_zeros; 475 connp->conn_mcbc_bind = B_TRUE; 476 break; 477 } 478 479 /* Any errors after this point should use late_error */ 480 481 /* 482 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 483 * with IPPROTO_TCP. 484 */ 485 connp->conn_lport = lport; 486 connp->conn_fport = 0; 487 488 if (connp->conn_family == AF_INET) { 489 ASSERT(connp->conn_ipversion == IPV4_VERSION); 490 } else { 491 ASSERT(connp->conn_ipversion == IPV6_VERSION); 492 } 493 494 icmp->icmp_state = TS_IDLE; 495 496 /* 497 * We create an initial header template here to make a subsequent 498 * sendto have a starting point. Since conn_last_dst is zero the 499 * first sendto will always follow the 'dst changed' code path. 500 * Note that we defer massaging options and the related checksum 501 * adjustment until we have a destination address. 
502 */ 503 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 504 &connp->conn_faddr_v6, connp->conn_flowinfo); 505 if (error != 0) { 506 mutex_exit(&connp->conn_lock); 507 goto late_error; 508 } 509 /* Just in case */ 510 connp->conn_faddr_v6 = ipv6_all_zeros; 511 connp->conn_v6lastdst = ipv6_all_zeros; 512 mutex_exit(&connp->conn_lock); 513 514 error = ip_laddr_fanout_insert(connp); 515 if (error != 0) 516 goto late_error; 517 518 /* Bind succeeded */ 519 return (0); 520 521 late_error: 522 mutex_enter(&connp->conn_lock); 523 connp->conn_saddr_v6 = ipv6_all_zeros; 524 connp->conn_bound_addr_v6 = ipv6_all_zeros; 525 connp->conn_laddr_v6 = ipv6_all_zeros; 526 if (scopeid != 0) { 527 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 528 connp->conn_incoming_ifindex = connp->conn_bound_if; 529 } 530 icmp->icmp_state = TS_UNBND; 531 connp->conn_v6lastdst = ipv6_all_zeros; 532 connp->conn_lport = 0; 533 534 /* Restore the header that was built above - different source address */ 535 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 536 &connp->conn_faddr_v6, connp->conn_flowinfo); 537 mutex_exit(&connp->conn_lock); 538 return (error); 539 } 540 541 /* 542 * Tell IP to just bind to the protocol. 543 */ 544 static void 545 icmp_bind_proto(icmp_t *icmp) 546 { 547 conn_t *connp = icmp->icmp_connp; 548 549 mutex_enter(&connp->conn_lock); 550 connp->conn_saddr_v6 = ipv6_all_zeros; 551 connp->conn_laddr_v6 = ipv6_all_zeros; 552 connp->conn_faddr_v6 = ipv6_all_zeros; 553 connp->conn_v6lastdst = ipv6_all_zeros; 554 mutex_exit(&connp->conn_lock); 555 556 (void) ip_laddr_fanout_insert(connp); 557 } 558 559 /* 560 * This routine handles each T_CONN_REQ message passed to icmp. It 561 * associates a default destination address with the stream. 562 * 563 * After various error checks are completed, icmp_connect() lays 564 * the target address and port into the composite header template. 
565 * Then we ask IP for information, including a source address if we didn't 566 * already have one. Finally we send up the T_OK_ACK reply message. 567 */ 568 static void 569 icmp_tpi_connect(queue_t *q, mblk_t *mp) 570 { 571 conn_t *connp = Q_TO_CONN(q); 572 struct T_conn_req *tcr; 573 struct sockaddr *sa; 574 socklen_t len; 575 int error; 576 cred_t *cr; 577 pid_t pid; 578 /* 579 * All Solaris components should pass a db_credp 580 * for this TPI message, hence we ASSERT. 581 * But in case there is some other M_PROTO that looks 582 * like a TPI message sent by some other kernel 583 * component, we check and return an error. 584 */ 585 cr = msg_getcred(mp, &pid); 586 ASSERT(cr != NULL); 587 if (cr == NULL) { 588 icmp_err_ack(q, mp, TSYSERR, EINVAL); 589 return; 590 } 591 592 tcr = (struct T_conn_req *)mp->b_rptr; 593 /* Sanity checks */ 594 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { 595 icmp_err_ack(q, mp, TPROTO, 0); 596 return; 597 } 598 599 if (tcr->OPT_length != 0) { 600 icmp_err_ack(q, mp, TBADOPT, 0); 601 return; 602 } 603 604 len = tcr->DEST_length; 605 606 switch (len) { 607 default: 608 icmp_err_ack(q, mp, TBADADDR, 0); 609 return; 610 case sizeof (sin_t): 611 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 612 sizeof (sin_t)); 613 break; 614 case sizeof (sin6_t): 615 sa = (struct sockaddr *)mi_offset_param(mp, 616 tcr->DEST_offset, sizeof (sin6_t)); 617 break; 618 } 619 620 error = proto_verify_ip_addr(connp->conn_family, sa, len); 621 if (error != 0) { 622 icmp_err_ack(q, mp, TSYSERR, error); 623 return; 624 } 625 626 error = rawip_do_connect(connp, sa, len, cr, pid); 627 if (error != 0) { 628 if (error < 0) { 629 icmp_err_ack(q, mp, -error, 0); 630 } else { 631 icmp_err_ack(q, mp, 0, error); 632 } 633 } else { 634 mblk_t *mp1; 635 636 /* 637 * We have to send a connection confirmation to 638 * keep TLI happy. 
639 */ 640 if (connp->conn_family == AF_INET) { 641 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 642 sizeof (sin_t), NULL, 0); 643 } else { 644 ASSERT(connp->conn_family == AF_INET6); 645 mp1 = mi_tpi_conn_con(NULL, (char *)sa, 646 sizeof (sin6_t), NULL, 0); 647 } 648 if (mp1 == NULL) { 649 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 650 return; 651 } 652 653 /* 654 * Send ok_ack for T_CONN_REQ 655 */ 656 mp = mi_tpi_ok_ack_alloc(mp); 657 if (mp == NULL) { 658 /* Unable to reuse the T_CONN_REQ for the ack. */ 659 icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); 660 return; 661 } 662 putnext(connp->conn_rq, mp); 663 putnext(connp->conn_rq, mp1); 664 } 665 } 666 667 static int 668 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 669 cred_t *cr, pid_t pid) 670 { 671 icmp_t *icmp; 672 sin_t *sin; 673 sin6_t *sin6; 674 int error; 675 uint16_t dstport; 676 ipaddr_t v4dst; 677 in6_addr_t v6dst; 678 uint32_t flowinfo; 679 ip_xmit_attr_t *ixa; 680 uint_t scopeid = 0; 681 uint_t srcid = 0; 682 in6_addr_t v6src = connp->conn_saddr_v6; 683 684 icmp = connp->conn_icmp; 685 686 if (sa == NULL || !OK_32PTR((char *)sa)) { 687 return (EINVAL); 688 } 689 690 ASSERT(sa != NULL && len != 0); 691 692 /* 693 * Determine packet type based on type of address passed in 694 * the request should contain an IPv4 or IPv6 address. 695 * Make sure that address family matches the type of 696 * family of the address passed down. 
697 */ 698 switch (len) { 699 case sizeof (sin_t): 700 sin = (sin_t *)sa; 701 702 v4dst = sin->sin_addr.s_addr; 703 dstport = sin->sin_port; 704 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 705 ASSERT(connp->conn_ipversion == IPV4_VERSION); 706 break; 707 708 case sizeof (sin6_t): 709 sin6 = (sin6_t *)sa; 710 711 /* No support for mapped addresses on raw sockets */ 712 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 713 return (EADDRNOTAVAIL); 714 } 715 v6dst = sin6->sin6_addr; 716 dstport = sin6->sin6_port; 717 ASSERT(connp->conn_ipversion == IPV6_VERSION); 718 flowinfo = sin6->sin6_flowinfo; 719 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) 720 scopeid = sin6->sin6_scope_id; 721 srcid = sin6->__sin6_src_id; 722 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 723 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 724 connp->conn_netstack); 725 } 726 break; 727 } 728 729 /* 730 * If there is a different thread using conn_ixa then we get a new 731 * copy and cut the old one loose from conn_ixa. Otherwise we use 732 * conn_ixa and prevent any other thread from using/changing it. 733 * Once connect() is done other threads can use conn_ixa since the 734 * refcnt will be back at one. 735 */ 736 ixa = conn_get_ixa(connp, B_TRUE); 737 if (ixa == NULL) 738 return (ENOMEM); 739 740 ASSERT(ixa->ixa_refcnt >= 2); 741 ASSERT(ixa == connp->conn_ixa); 742 743 mutex_enter(&connp->conn_lock); 744 /* 745 * This icmp_t must have bound already before doing a connect. 746 * Reject if a connect is in progress (we drop conn_lock during 747 * rawip_do_connect). 
748 */ 749 if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { 750 mutex_exit(&connp->conn_lock); 751 ixa_refrele(ixa); 752 return (-TOUTSTATE); 753 } 754 755 if (icmp->icmp_state == TS_DATA_XFER) { 756 /* Already connected - clear out state */ 757 if (connp->conn_mcbc_bind) 758 connp->conn_saddr_v6 = ipv6_all_zeros; 759 else 760 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 761 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 762 connp->conn_faddr_v6 = ipv6_all_zeros; 763 icmp->icmp_state = TS_IDLE; 764 } 765 766 /* 767 * Use sin_port/sin6_port since applications like psh use SOCK_RAW 768 * with IPPROTO_TCP. 769 */ 770 connp->conn_fport = dstport; 771 if (connp->conn_ipversion == IPV4_VERSION) { 772 /* 773 * Interpret a zero destination to mean loopback. 774 * Update the T_CONN_REQ (sin/sin6) since it is used to 775 * generate the T_CONN_CON. 776 */ 777 if (v4dst == INADDR_ANY) { 778 v4dst = htonl(INADDR_LOOPBACK); 779 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); 780 ASSERT(connp->conn_family == AF_INET); 781 sin->sin_addr.s_addr = v4dst; 782 } 783 connp->conn_faddr_v6 = v6dst; 784 connp->conn_flowinfo = 0; 785 } else { 786 ASSERT(connp->conn_ipversion == IPV6_VERSION); 787 /* 788 * Interpret a zero destination to mean loopback. 789 * Update the T_CONN_REQ (sin/sin6) since it is used to 790 * generate the T_CONN_CON. 
791 */ 792 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { 793 v6dst = ipv6_loopback; 794 sin6->sin6_addr = v6dst; 795 } 796 connp->conn_faddr_v6 = v6dst; 797 connp->conn_flowinfo = flowinfo; 798 } 799 800 /* 801 * We update our cred/cpid based on the caller of connect 802 */ 803 if (connp->conn_cred != cr) { 804 crhold(cr); 805 crfree(connp->conn_cred); 806 connp->conn_cred = cr; 807 } 808 connp->conn_cpid = pid; 809 ixa->ixa_cred = cr; 810 ixa->ixa_cpid = pid; 811 if (is_system_labeled()) { 812 /* We need to restart with a label based on the cred */ 813 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 814 } 815 816 if (scopeid != 0) { 817 ixa->ixa_flags |= IXAF_SCOPEID_SET; 818 ixa->ixa_scopeid = scopeid; 819 connp->conn_incoming_ifindex = scopeid; 820 } else { 821 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 822 connp->conn_incoming_ifindex = connp->conn_bound_if; 823 } 824 825 /* 826 * conn_connect will drop conn_lock and reacquire it. 827 * To prevent a send* from messing with this icmp_t while the lock 828 * is dropped we set icmp_state and clear conn_v6lastdst. 829 * That will make all send* fail with EISCONN. 830 */ 831 connp->conn_v6lastdst = ipv6_all_zeros; 832 icmp->icmp_state = TS_WCON_CREQ; 833 834 error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); 835 mutex_exit(&connp->conn_lock); 836 if (error != 0) 837 goto connect_failed; 838 839 /* 840 * The addresses have been verified. Time to insert in 841 * the correct fanout list. 
842 */ 843 error = ipcl_conn_insert(connp); 844 if (error != 0) 845 goto connect_failed; 846 847 mutex_enter(&connp->conn_lock); 848 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 849 &connp->conn_faddr_v6, connp->conn_flowinfo); 850 if (error != 0) { 851 mutex_exit(&connp->conn_lock); 852 goto connect_failed; 853 } 854 855 icmp->icmp_state = TS_DATA_XFER; 856 /* Record this as the "last" send even though we haven't sent any */ 857 connp->conn_v6lastdst = connp->conn_faddr_v6; 858 connp->conn_lastipversion = connp->conn_ipversion; 859 connp->conn_lastdstport = connp->conn_fport; 860 connp->conn_lastflowinfo = connp->conn_flowinfo; 861 connp->conn_lastscopeid = scopeid; 862 connp->conn_lastsrcid = srcid; 863 /* Also remember a source to use together with lastdst */ 864 connp->conn_v6lastsrc = v6src; 865 mutex_exit(&connp->conn_lock); 866 867 ixa_refrele(ixa); 868 return (0); 869 870 connect_failed: 871 if (ixa != NULL) 872 ixa_refrele(ixa); 873 mutex_enter(&connp->conn_lock); 874 icmp->icmp_state = TS_IDLE; 875 /* In case the source address was set above */ 876 if (connp->conn_mcbc_bind) 877 connp->conn_saddr_v6 = ipv6_all_zeros; 878 else 879 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 880 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 881 connp->conn_faddr_v6 = ipv6_all_zeros; 882 connp->conn_v6lastdst = ipv6_all_zeros; 883 connp->conn_flowinfo = 0; 884 885 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 886 &connp->conn_faddr_v6, connp->conn_flowinfo); 887 mutex_exit(&connp->conn_lock); 888 return (error); 889 } 890 891 static void 892 rawip_do_close(conn_t *connp) 893 { 894 ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); 895 896 ip_quiesce_conn(connp); 897 898 if (!IPCL_IS_NONSTR(connp)) { 899 qprocsoff(connp->conn_rq); 900 } 901 902 icmp_close_free(connp); 903 904 /* 905 * Now we are truly single threaded on this stream, and can 906 * delete the things hanging off the connp, and finally the connp. 
907 * We removed this connp from the fanout list, it cannot be 908 * accessed thru the fanouts, and we already waited for the 909 * conn_ref to drop to 0. We are already in close, so 910 * there cannot be any other thread from the top. qprocsoff 911 * has completed, and service has completed or won't run in 912 * future. 913 */ 914 ASSERT(connp->conn_ref == 1); 915 916 if (!IPCL_IS_NONSTR(connp)) { 917 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 918 } else { 919 ip_free_helper_stream(connp); 920 } 921 922 connp->conn_ref--; 923 ipcl_conn_destroy(connp); 924 } 925 926 static int 927 icmp_close(queue_t *q, int flags) 928 { 929 conn_t *connp; 930 931 if (flags & SO_FALLBACK) { 932 /* 933 * stream is being closed while in fallback 934 * simply free the resources that were allocated 935 */ 936 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 937 qprocsoff(q); 938 goto done; 939 } 940 941 connp = Q_TO_CONN(q); 942 (void) rawip_do_close(connp); 943 done: 944 q->q_ptr = WR(q)->q_ptr = NULL; 945 return (0); 946 } 947 948 static void 949 icmp_close_free(conn_t *connp) 950 { 951 icmp_t *icmp = connp->conn_icmp; 952 953 if (icmp->icmp_filter != NULL) { 954 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); 955 icmp->icmp_filter = NULL; 956 } 957 958 /* 959 * Clear any fields which the kmem_cache constructor clears. 960 * Only icmp_connp needs to be preserved. 961 * TBD: We should make this more efficient to avoid clearing 962 * everything. 963 */ 964 ASSERT(icmp->icmp_connp == connp); 965 bzero(icmp, sizeof (icmp_t)); 966 icmp->icmp_connp = connp; 967 } 968 969 /* 970 * This routine handles each T_DISCON_REQ message passed to icmp 971 * as an indicating that ICMP is no longer connected. This results 972 * in telling IP to restore the binding to just the local address. 
973 */ 974 static int 975 icmp_do_disconnect(conn_t *connp) 976 { 977 icmp_t *icmp = connp->conn_icmp; 978 int error; 979 980 mutex_enter(&connp->conn_lock); 981 if (icmp->icmp_state != TS_DATA_XFER) { 982 mutex_exit(&connp->conn_lock); 983 return (-TOUTSTATE); 984 } 985 if (connp->conn_mcbc_bind) 986 connp->conn_saddr_v6 = ipv6_all_zeros; 987 else 988 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 989 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 990 connp->conn_faddr_v6 = ipv6_all_zeros; 991 icmp->icmp_state = TS_IDLE; 992 993 connp->conn_v6lastdst = ipv6_all_zeros; 994 error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 995 &connp->conn_faddr_v6, connp->conn_flowinfo); 996 mutex_exit(&connp->conn_lock); 997 if (error != 0) 998 return (error); 999 1000 /* 1001 * Tell IP to remove the full binding and revert 1002 * to the local address binding. 1003 */ 1004 return (ip_laddr_fanout_insert(connp)); 1005 } 1006 1007 static void 1008 icmp_tpi_disconnect(queue_t *q, mblk_t *mp) 1009 { 1010 conn_t *connp = Q_TO_CONN(q); 1011 int error; 1012 1013 /* 1014 * Allocate the largest primitive we need to send back 1015 * T_error_ack is > than T_ok_ack 1016 */ 1017 mp = reallocb(mp, sizeof (struct T_error_ack), 1); 1018 if (mp == NULL) { 1019 /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ 1020 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); 1021 return; 1022 } 1023 1024 error = icmp_do_disconnect(connp); 1025 1026 if (error != 0) { 1027 if (error > 0) { 1028 icmp_err_ack(q, mp, 0, error); 1029 } else { 1030 icmp_err_ack(q, mp, -error, 0); 1031 } 1032 } else { 1033 mp = mi_tpi_ok_ack_alloc(mp); 1034 ASSERT(mp != NULL); 1035 qreply(q, mp); 1036 } 1037 } 1038 1039 static int 1040 icmp_disconnect(conn_t *connp) 1041 { 1042 int error; 1043 1044 connp->conn_dgram_errind = B_FALSE; 1045 1046 error = icmp_do_disconnect(connp); 1047 1048 if (error < 0) 1049 error = proto_tlitosyserr(-error); 1050 return (error); 1051 } 1052 1053 /* This routine creates a T_ERROR_ACK message and passes it upstream. */ 1054 static void 1055 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 1056 { 1057 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 1058 qreply(q, mp); 1059 } 1060 1061 /* Shorthand to generate and send TPI error acks to our client */ 1062 static void 1063 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, 1064 t_scalar_t t_error, int sys_error) 1065 { 1066 struct T_error_ack *teackp; 1067 1068 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 1069 M_PCPROTO, T_ERROR_ACK)) != NULL) { 1070 teackp = (struct T_error_ack *)mp->b_rptr; 1071 teackp->ERROR_prim = primitive; 1072 teackp->TLI_error = t_error; 1073 teackp->UNIX_error = sys_error; 1074 qreply(q, mp); 1075 } 1076 } 1077 1078 /* 1079 * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. 1080 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 1081 * Assumes that IP has pulled up everything up to and including the ICMP header. 
1082 */ 1083 /* ARGSUSED2 */ 1084 static void 1085 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 1086 { 1087 conn_t *connp = (conn_t *)arg1; 1088 icmp_t *icmp = connp->conn_icmp; 1089 icmph_t *icmph; 1090 ipha_t *ipha; 1091 int iph_hdr_length; 1092 sin_t sin; 1093 mblk_t *mp1; 1094 int error = 0; 1095 1096 ipha = (ipha_t *)mp->b_rptr; 1097 1098 ASSERT(OK_32PTR(mp->b_rptr)); 1099 1100 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { 1101 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); 1102 icmp_icmp_error_ipv6(connp, mp, ira); 1103 return; 1104 } 1105 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 1106 1107 /* Skip past the outer IP and ICMP headers */ 1108 ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); 1109 iph_hdr_length = ira->ira_ip_hdr_length; 1110 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 1111 ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ 1112 1113 iph_hdr_length = IPH_HDR_LENGTH(ipha); 1114 1115 switch (icmph->icmph_type) { 1116 case ICMP_DEST_UNREACHABLE: 1117 switch (icmph->icmph_code) { 1118 case ICMP_FRAGMENTATION_NEEDED: { 1119 ipha_t *ipha; 1120 ip_xmit_attr_t *ixa; 1121 /* 1122 * IP has already adjusted the path MTU. 1123 * But we need to adjust DF for IPv4. 1124 */ 1125 if (connp->conn_ipversion != IPV4_VERSION) 1126 break; 1127 1128 ixa = conn_get_ixa(connp, B_FALSE); 1129 if (ixa == NULL || ixa->ixa_ire == NULL) { 1130 /* 1131 * Some other thread holds conn_ixa. We will 1132 * redo this on the next ICMP too big. 
1133 */ 1134 if (ixa != NULL) 1135 ixa_refrele(ixa); 1136 break; 1137 } 1138 (void) ip_get_pmtu(ixa); 1139 1140 mutex_enter(&connp->conn_lock); 1141 ipha = (ipha_t *)connp->conn_ht_iphc; 1142 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 1143 ipha->ipha_fragment_offset_and_flags |= 1144 IPH_DF_HTONS; 1145 } else { 1146 ipha->ipha_fragment_offset_and_flags &= 1147 ~IPH_DF_HTONS; 1148 } 1149 mutex_exit(&connp->conn_lock); 1150 ixa_refrele(ixa); 1151 break; 1152 } 1153 case ICMP_PORT_UNREACHABLE: 1154 case ICMP_PROTOCOL_UNREACHABLE: 1155 error = ECONNREFUSED; 1156 break; 1157 default: 1158 /* Transient errors */ 1159 break; 1160 } 1161 break; 1162 default: 1163 /* Transient errors */ 1164 break; 1165 } 1166 if (error == 0) { 1167 freemsg(mp); 1168 return; 1169 } 1170 1171 /* 1172 * Deliver T_UDERROR_IND when the application has asked for it. 1173 * The socket layer enables this automatically when connected. 1174 */ 1175 if (!connp->conn_dgram_errind) { 1176 freemsg(mp); 1177 return; 1178 } 1179 1180 sin = sin_null; 1181 sin.sin_family = AF_INET; 1182 sin.sin_addr.s_addr = ipha->ipha_dst; 1183 1184 if (IPCL_IS_NONSTR(connp)) { 1185 mutex_enter(&connp->conn_lock); 1186 if (icmp->icmp_state == TS_DATA_XFER) { 1187 if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { 1188 mutex_exit(&connp->conn_lock); 1189 (*connp->conn_upcalls->su_set_error) 1190 (connp->conn_upper_handle, error); 1191 goto done; 1192 } 1193 } else { 1194 icmp->icmp_delayed_error = error; 1195 *((sin_t *)&icmp->icmp_delayed_addr) = sin; 1196 } 1197 mutex_exit(&connp->conn_lock); 1198 } else { 1199 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, 1200 error); 1201 if (mp1 != NULL) 1202 putnext(connp->conn_rq, mp1); 1203 } 1204 done: 1205 freemsg(mp); 1206 } 1207 1208 /* 1209 * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. 1210 * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. 
1211 * Assumes that IP has pulled up all the extension headers as well as the 1212 * ICMPv6 header. 1213 */ 1214 static void 1215 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) 1216 { 1217 icmp6_t *icmp6; 1218 ip6_t *ip6h, *outer_ip6h; 1219 uint16_t iph_hdr_length; 1220 uint8_t *nexthdrp; 1221 sin6_t sin6; 1222 mblk_t *mp1; 1223 int error = 0; 1224 icmp_t *icmp = connp->conn_icmp; 1225 1226 outer_ip6h = (ip6_t *)mp->b_rptr; 1227 #ifdef DEBUG 1228 if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) 1229 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); 1230 else 1231 iph_hdr_length = IPV6_HDR_LEN; 1232 ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); 1233 #endif 1234 /* Skip past the outer IP and ICMP headers */ 1235 iph_hdr_length = ira->ira_ip_hdr_length; 1236 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 1237 1238 ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ 1239 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { 1240 freemsg(mp); 1241 return; 1242 } 1243 1244 switch (icmp6->icmp6_type) { 1245 case ICMP6_DST_UNREACH: 1246 switch (icmp6->icmp6_code) { 1247 case ICMP6_DST_UNREACH_NOPORT: 1248 error = ECONNREFUSED; 1249 break; 1250 case ICMP6_DST_UNREACH_ADMIN: 1251 case ICMP6_DST_UNREACH_NOROUTE: 1252 case ICMP6_DST_UNREACH_BEYONDSCOPE: 1253 case ICMP6_DST_UNREACH_ADDR: 1254 /* Transient errors */ 1255 break; 1256 default: 1257 break; 1258 } 1259 break; 1260 case ICMP6_PACKET_TOO_BIG: { 1261 struct T_unitdata_ind *tudi; 1262 struct T_opthdr *toh; 1263 size_t udi_size; 1264 mblk_t *newmp; 1265 t_scalar_t opt_length = sizeof (struct T_opthdr) + 1266 sizeof (struct ip6_mtuinfo); 1267 sin6_t *sin6; 1268 struct ip6_mtuinfo *mtuinfo; 1269 1270 /* 1271 * If the application has requested to receive path mtu 1272 * information, send up an empty message containing an 1273 * IPV6_PATHMTU ancillary data item. 
1274 */ 1275 if (!connp->conn_ipv6_recvpathmtu) 1276 break; 1277 1278 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + 1279 opt_length; 1280 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) { 1281 BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors); 1282 break; 1283 } 1284 1285 /* 1286 * newmp->b_cont is left to NULL on purpose. This is an 1287 * empty message containing only ancillary data. 1288 */ 1289 newmp->b_datap->db_type = M_PROTO; 1290 tudi = (struct T_unitdata_ind *)newmp->b_rptr; 1291 newmp->b_wptr = (uchar_t *)tudi + udi_size; 1292 tudi->PRIM_type = T_UNITDATA_IND; 1293 tudi->SRC_length = sizeof (sin6_t); 1294 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 1295 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t); 1296 tudi->OPT_length = opt_length; 1297 1298 sin6 = (sin6_t *)&tudi[1]; 1299 bzero(sin6, sizeof (sin6_t)); 1300 sin6->sin6_family = AF_INET6; 1301 sin6->sin6_addr = connp->conn_faddr_v6; 1302 1303 toh = (struct T_opthdr *)&sin6[1]; 1304 toh->level = IPPROTO_IPV6; 1305 toh->name = IPV6_PATHMTU; 1306 toh->len = opt_length; 1307 toh->status = 0; 1308 1309 mtuinfo = (struct ip6_mtuinfo *)&toh[1]; 1310 bzero(mtuinfo, sizeof (struct ip6_mtuinfo)); 1311 mtuinfo->ip6m_addr.sin6_family = AF_INET6; 1312 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst; 1313 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu; 1314 /* 1315 * We've consumed everything we need from the original 1316 * message. Free it, then send our empty message. 
1317 */ 1318 freemsg(mp); 1319 icmp_ulp_recv(connp, newmp, msgdsize(newmp)); 1320 return; 1321 } 1322 case ICMP6_TIME_EXCEEDED: 1323 /* Transient errors */ 1324 break; 1325 case ICMP6_PARAM_PROB: 1326 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 1327 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 1328 (uchar_t *)ip6h + icmp6->icmp6_pptr == 1329 (uchar_t *)nexthdrp) { 1330 error = ECONNREFUSED; 1331 break; 1332 } 1333 break; 1334 } 1335 if (error == 0) { 1336 freemsg(mp); 1337 return; 1338 } 1339 1340 /* 1341 * Deliver T_UDERROR_IND when the application has asked for it. 1342 * The socket layer enables this automatically when connected. 1343 */ 1344 if (!connp->conn_dgram_errind) { 1345 freemsg(mp); 1346 return; 1347 } 1348 1349 sin6 = sin6_null; 1350 sin6.sin6_family = AF_INET6; 1351 sin6.sin6_addr = ip6h->ip6_dst; 1352 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 1353 if (IPCL_IS_NONSTR(connp)) { 1354 mutex_enter(&connp->conn_lock); 1355 if (icmp->icmp_state == TS_DATA_XFER) { 1356 if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, 1357 &connp->conn_faddr_v6)) { 1358 mutex_exit(&connp->conn_lock); 1359 (*connp->conn_upcalls->su_set_error) 1360 (connp->conn_upper_handle, error); 1361 goto done; 1362 } 1363 } else { 1364 icmp->icmp_delayed_error = error; 1365 *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; 1366 } 1367 mutex_exit(&connp->conn_lock); 1368 } else { 1369 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), 1370 NULL, 0, error); 1371 if (mp1 != NULL) 1372 putnext(connp->conn_rq, mp1); 1373 } 1374 done: 1375 freemsg(mp); 1376 } 1377 1378 /* 1379 * This routine responds to T_ADDR_REQ messages. It is called by icmp_wput. 1380 * The local address is filled in if endpoint is bound. The remote address 1381 * is filled in if remote address has been precified ("connected endpoint") 1382 * (The concept of connected CLTS sockets is alien to published TPI 1383 * but we support it anyway). 
1384 */ 1385 static void 1386 icmp_addr_req(queue_t *q, mblk_t *mp) 1387 { 1388 struct sockaddr *sa; 1389 mblk_t *ackmp; 1390 struct T_addr_ack *taa; 1391 icmp_t *icmp = Q_TO_ICMP(q); 1392 conn_t *connp = icmp->icmp_connp; 1393 uint_t addrlen; 1394 1395 /* Make it large enough for worst case */ 1396 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 1397 2 * sizeof (sin6_t), 1); 1398 if (ackmp == NULL) { 1399 icmp_err_ack(q, mp, TSYSERR, ENOMEM); 1400 return; 1401 } 1402 taa = (struct T_addr_ack *)ackmp->b_rptr; 1403 1404 bzero(taa, sizeof (struct T_addr_ack)); 1405 ackmp->b_wptr = (uchar_t *)&taa[1]; 1406 1407 taa->PRIM_type = T_ADDR_ACK; 1408 ackmp->b_datap->db_type = M_PCPROTO; 1409 1410 if (connp->conn_family == AF_INET) 1411 addrlen = sizeof (sin_t); 1412 else 1413 addrlen = sizeof (sin6_t); 1414 1415 mutex_enter(&connp->conn_lock); 1416 /* 1417 * Note: Following code assumes 32 bit alignment of basic 1418 * data structures like sin_t and struct T_addr_ack. 1419 */ 1420 if (icmp->icmp_state != TS_UNBND) { 1421 /* 1422 * Fill in local address first 1423 */ 1424 taa->LOCADDR_offset = sizeof (*taa); 1425 taa->LOCADDR_length = addrlen; 1426 sa = (struct sockaddr *)&taa[1]; 1427 (void) conn_getsockname(connp, sa, &addrlen); 1428 ackmp->b_wptr += addrlen; 1429 } 1430 if (icmp->icmp_state == TS_DATA_XFER) { 1431 /* 1432 * connected, fill remote address too 1433 */ 1434 taa->REMADDR_length = addrlen; 1435 /* assumed 32-bit alignment */ 1436 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 1437 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 1438 (void) conn_getpeername(connp, sa, &addrlen); 1439 ackmp->b_wptr += addrlen; 1440 } 1441 mutex_exit(&connp->conn_lock); 1442 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 1443 qreply(q, ackmp); 1444 } 1445 1446 static void 1447 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) 1448 { 1449 conn_t *connp = icmp->icmp_connp; 1450 1451 *tap = icmp_g_t_info_ack; 1452 1453 if (connp->conn_family 
== AF_INET6) 1454 tap->ADDR_size = sizeof (sin6_t); 1455 else 1456 tap->ADDR_size = sizeof (sin_t); 1457 tap->CURRENT_state = icmp->icmp_state; 1458 tap->OPT_size = icmp_max_optsize; 1459 } 1460 1461 static void 1462 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, 1463 t_uscalar_t cap_bits1) 1464 { 1465 tcap->CAP_bits1 = 0; 1466 1467 if (cap_bits1 & TC1_INFO) { 1468 icmp_copy_info(&tcap->INFO_ack, icmp); 1469 tcap->CAP_bits1 |= TC1_INFO; 1470 } 1471 } 1472 1473 /* 1474 * This routine responds to T_CAPABILITY_REQ messages. It is called by 1475 * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from 1476 * icmp_g_t_info_ack. The current state of the stream is copied from 1477 * icmp_state. 1478 */ 1479 static void 1480 icmp_capability_req(queue_t *q, mblk_t *mp) 1481 { 1482 icmp_t *icmp = Q_TO_ICMP(q); 1483 t_uscalar_t cap_bits1; 1484 struct T_capability_ack *tcap; 1485 1486 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 1487 1488 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 1489 mp->b_datap->db_type, T_CAPABILITY_ACK); 1490 if (!mp) 1491 return; 1492 1493 tcap = (struct T_capability_ack *)mp->b_rptr; 1494 1495 icmp_do_capability_ack(icmp, tcap, cap_bits1); 1496 1497 qreply(q, mp); 1498 } 1499 1500 /* 1501 * This routine responds to T_INFO_REQ messages. It is called by icmp_wput. 1502 * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack. 1503 * The current state of the stream is copied from icmp_state. 1504 */ 1505 static void 1506 icmp_info_req(queue_t *q, mblk_t *mp) 1507 { 1508 icmp_t *icmp = Q_TO_ICMP(q); 1509 1510 /* Create a T_INFO_ACK message. 
*/ 1511 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 1512 T_INFO_ACK); 1513 if (!mp) 1514 return; 1515 icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp); 1516 qreply(q, mp); 1517 } 1518 1519 static int 1520 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 1521 int family) 1522 { 1523 conn_t *connp; 1524 dev_t conn_dev; 1525 int error; 1526 1527 /* If the stream is already open, return immediately. */ 1528 if (q->q_ptr != NULL) 1529 return (0); 1530 1531 if (sflag == MODOPEN) 1532 return (EINVAL); 1533 1534 /* 1535 * Since ICMP is not used so heavily, allocating from the small 1536 * arena should be sufficient. 1537 */ 1538 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 1539 return (EBUSY); 1540 } 1541 1542 if (flag & SO_FALLBACK) { 1543 /* 1544 * Non streams socket needs a stream to fallback to 1545 */ 1546 RD(q)->q_ptr = (void *)conn_dev; 1547 WR(q)->q_qinfo = &icmp_fallback_sock_winit; 1548 WR(q)->q_ptr = (void *)ip_minor_arena_sa; 1549 qprocson(q); 1550 return (0); 1551 } 1552 1553 connp = rawip_do_open(family, credp, &error, KM_SLEEP); 1554 if (connp == NULL) { 1555 ASSERT(error != 0); 1556 inet_minor_free(ip_minor_arena_sa, connp->conn_dev); 1557 return (error); 1558 } 1559 1560 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 1561 connp->conn_dev = conn_dev; 1562 connp->conn_minor_arena = ip_minor_arena_sa; 1563 1564 /* 1565 * Initialize the icmp_t structure for this stream. 1566 */ 1567 q->q_ptr = connp; 1568 WR(q)->q_ptr = connp; 1569 connp->conn_rq = q; 1570 connp->conn_wq = WR(q); 1571 1572 WR(q)->q_hiwat = connp->conn_sndbuf; 1573 WR(q)->q_lowat = connp->conn_sndlowat; 1574 1575 qprocson(q); 1576 1577 /* Set the Stream head write offset. 
*/ 1578 (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); 1579 (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); 1580 1581 mutex_enter(&connp->conn_lock); 1582 connp->conn_state_flags &= ~CONN_INCIPIENT; 1583 mutex_exit(&connp->conn_lock); 1584 1585 icmp_bind_proto(connp->conn_icmp); 1586 1587 return (0); 1588 } 1589 1590 /* For /dev/icmp aka AF_INET open */ 1591 static int 1592 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1593 { 1594 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); 1595 } 1596 1597 /* For /dev/icmp6 aka AF_INET6 open */ 1598 static int 1599 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 1600 { 1601 return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); 1602 } 1603 1604 /* 1605 * This is the open routine for icmp. It allocates a icmp_t structure for 1606 * the stream and, on the first open of the module, creates an ND table. 1607 */ 1608 static conn_t * 1609 rawip_do_open(int family, cred_t *credp, int *err, int flags) 1610 { 1611 icmp_t *icmp; 1612 conn_t *connp; 1613 zoneid_t zoneid; 1614 netstack_t *ns; 1615 icmp_stack_t *is; 1616 int len; 1617 boolean_t isv6 = B_FALSE; 1618 1619 *err = secpolicy_net_icmpaccess(credp); 1620 if (*err != 0) 1621 return (NULL); 1622 1623 if (family == AF_INET6) 1624 isv6 = B_TRUE; 1625 1626 ns = netstack_find_by_cred(credp); 1627 ASSERT(ns != NULL); 1628 is = ns->netstack_icmp; 1629 ASSERT(is != NULL); 1630 1631 /* 1632 * For exclusive stacks we set the zoneid to zero 1633 * to make ICMP operate as if in the global zone. 1634 */ 1635 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 1636 zoneid = GLOBAL_ZONEID; 1637 else 1638 zoneid = crgetzoneid(credp); 1639 1640 ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); 1641 1642 connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); 1643 icmp = connp->conn_icmp; 1644 1645 /* 1646 * ipcl_conn_create did a netstack_hold. 
Undo the hold that was 1647 * done by netstack_find_by_cred() 1648 */ 1649 netstack_rele(ns); 1650 1651 /* 1652 * Since this conn_t/icmp_t is not yet visible to anybody else we don't 1653 * need to lock anything. 1654 */ 1655 ASSERT(connp->conn_proto == IPPROTO_ICMP); 1656 ASSERT(connp->conn_icmp == icmp); 1657 ASSERT(icmp->icmp_connp == connp); 1658 1659 /* Set the initial state of the stream and the privilege status. */ 1660 icmp->icmp_state = TS_UNBND; 1661 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 1662 if (isv6) { 1663 connp->conn_family = AF_INET6; 1664 connp->conn_ipversion = IPV6_VERSION; 1665 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; 1666 connp->conn_proto = IPPROTO_ICMPV6; 1667 /* May be changed by a SO_PROTOTYPE socket option. */ 1668 connp->conn_proto = IPPROTO_ICMPV6; 1669 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1670 connp->conn_ixa->ixa_raw_cksum_offset = 2; 1671 connp->conn_default_ttl = is->is_ipv6_hoplimit; 1672 len = sizeof (ip6_t); 1673 } else { 1674 connp->conn_family = AF_INET; 1675 connp->conn_ipversion = IPV4_VERSION; 1676 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; 1677 /* May be changed by a SO_PROTOTYPE socket option. */ 1678 connp->conn_proto = IPPROTO_ICMP; 1679 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1680 connp->conn_default_ttl = is->is_ipv4_ttl; 1681 len = sizeof (ipha_t); 1682 } 1683 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 1684 1685 connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1686 1687 /* 1688 * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, 1689 * the checksum is provided in the pre-built packet. We clear 1690 * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a 1691 * complete IP header and not to compute the transport checksum. 
1692 */ 1693 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; 1694 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 1695 connp->conn_ixa->ixa_zoneid = zoneid; 1696 1697 connp->conn_zoneid = zoneid; 1698 1699 /* 1700 * If the caller has the process-wide flag set, then default to MAC 1701 * exempt mode. This allows read-down to unlabeled hosts. 1702 */ 1703 if (getpflags(NET_MAC_AWARE, credp) != 0) 1704 connp->conn_mac_mode = CONN_MAC_AWARE; 1705 1706 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 1707 1708 icmp->icmp_is = is; 1709 1710 connp->conn_rcvbuf = is->is_recv_hiwat; 1711 connp->conn_sndbuf = is->is_xmit_hiwat; 1712 connp->conn_sndlowat = is->is_xmit_lowat; 1713 connp->conn_rcvlowat = icmp_mod_info.mi_lowat; 1714 1715 connp->conn_wroff = len + is->is_wroff_extra; 1716 connp->conn_so_type = SOCK_RAW; 1717 1718 connp->conn_recv = icmp_input; 1719 connp->conn_recvicmp = icmp_icmp_input; 1720 crhold(credp); 1721 connp->conn_cred = credp; 1722 connp->conn_cpid = curproc->p_pid; 1723 connp->conn_open_time = ddi_get_lbolt64(); 1724 /* Cache things in ixa without an extra refhold */ 1725 connp->conn_ixa->ixa_cred = connp->conn_cred; 1726 connp->conn_ixa->ixa_cpid = connp->conn_cpid; 1727 if (is_system_labeled()) 1728 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); 1729 1730 connp->conn_flow_cntrld = B_FALSE; 1731 1732 if (is->is_pmtu_discovery) 1733 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 1734 1735 return (connp); 1736 } 1737 1738 /* 1739 * Which ICMP options OK to set through T_UNITDATA_REQ... 
1740 */ 1741 /* ARGSUSED */ 1742 static boolean_t 1743 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) 1744 { 1745 return (B_TRUE); 1746 } 1747 1748 /* 1749 * This routine gets default values of certain options whose default 1750 * values are maintained by protcol specific code 1751 */ 1752 int 1753 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 1754 { 1755 icmp_t *icmp = Q_TO_ICMP(q); 1756 icmp_stack_t *is = icmp->icmp_is; 1757 int *i1 = (int *)ptr; 1758 1759 switch (level) { 1760 case IPPROTO_IP: 1761 switch (name) { 1762 case IP_MULTICAST_TTL: 1763 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; 1764 return (sizeof (uchar_t)); 1765 case IP_MULTICAST_LOOP: 1766 *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; 1767 return (sizeof (uchar_t)); 1768 } 1769 break; 1770 case IPPROTO_IPV6: 1771 switch (name) { 1772 case IPV6_MULTICAST_HOPS: 1773 *i1 = IP_DEFAULT_MULTICAST_TTL; 1774 return (sizeof (int)); 1775 case IPV6_MULTICAST_LOOP: 1776 *i1 = IP_DEFAULT_MULTICAST_LOOP; 1777 return (sizeof (int)); 1778 case IPV6_UNICAST_HOPS: 1779 *i1 = is->is_ipv6_hoplimit; 1780 return (sizeof (int)); 1781 } 1782 break; 1783 case IPPROTO_ICMPV6: 1784 switch (name) { 1785 case ICMP6_FILTER: 1786 /* Make it look like "pass all" */ 1787 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1788 return (sizeof (icmp6_filter_t)); 1789 } 1790 break; 1791 } 1792 return (-1); 1793 } 1794 1795 /* 1796 * This routine retrieves the current status of socket options. 1797 * It returns the size of the option retrieved, or -1. 
1798 */ 1799 int 1800 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 1801 { 1802 icmp_t *icmp = connp->conn_icmp; 1803 int *i1 = (int *)ptr; 1804 conn_opt_arg_t coas; 1805 int retval; 1806 1807 coas.coa_connp = connp; 1808 coas.coa_ixa = connp->conn_ixa; 1809 coas.coa_ipp = &connp->conn_xmit_ipp; 1810 coas.coa_ancillary = B_FALSE; 1811 coas.coa_changed = 0; 1812 1813 /* 1814 * We assume that the optcom framework has checked for the set 1815 * of levels and names that are supported, hence we don't worry 1816 * about rejecting based on that. 1817 * First check for ICMP specific handling, then pass to common routine. 1818 */ 1819 switch (level) { 1820 case IPPROTO_IP: 1821 /* 1822 * Only allow IPv4 option processing on IPv4 sockets. 1823 */ 1824 if (connp->conn_family != AF_INET) 1825 return (-1); 1826 1827 switch (name) { 1828 case IP_OPTIONS: 1829 case T_IP_OPTIONS: 1830 /* Options are passed up with each packet */ 1831 return (0); 1832 case IP_HDRINCL: 1833 mutex_enter(&connp->conn_lock); 1834 *i1 = (int)icmp->icmp_hdrincl; 1835 mutex_exit(&connp->conn_lock); 1836 return (sizeof (int)); 1837 } 1838 break; 1839 1840 case IPPROTO_IPV6: 1841 /* 1842 * Only allow IPv6 option processing on native IPv6 sockets. 1843 */ 1844 if (connp->conn_family != AF_INET6) 1845 return (-1); 1846 1847 switch (name) { 1848 case IPV6_CHECKSUM: 1849 /* 1850 * Return offset or -1 if no checksum offset. 1851 * Does not apply to IPPROTO_ICMPV6 1852 */ 1853 if (connp->conn_proto == IPPROTO_ICMPV6) 1854 return (-1); 1855 1856 mutex_enter(&connp->conn_lock); 1857 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) 1858 *i1 = connp->conn_ixa->ixa_raw_cksum_offset; 1859 else 1860 *i1 = -1; 1861 mutex_exit(&connp->conn_lock); 1862 return (sizeof (int)); 1863 } 1864 break; 1865 1866 case IPPROTO_ICMPV6: 1867 /* 1868 * Only allow IPv6 option processing on native IPv6 sockets. 
1869 */ 1870 if (connp->conn_family != AF_INET6) 1871 return (-1); 1872 1873 if (connp->conn_proto != IPPROTO_ICMPV6) 1874 return (-1); 1875 1876 switch (name) { 1877 case ICMP6_FILTER: 1878 mutex_enter(&connp->conn_lock); 1879 if (icmp->icmp_filter == NULL) { 1880 /* Make it look like "pass all" */ 1881 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); 1882 } else { 1883 (void) bcopy(icmp->icmp_filter, ptr, 1884 sizeof (icmp6_filter_t)); 1885 } 1886 mutex_exit(&connp->conn_lock); 1887 return (sizeof (icmp6_filter_t)); 1888 } 1889 } 1890 mutex_enter(&connp->conn_lock); 1891 retval = conn_opt_get(&coas, level, name, ptr); 1892 mutex_exit(&connp->conn_lock); 1893 return (retval); 1894 } 1895 1896 /* 1897 * This routine retrieves the current status of socket options. 1898 * It returns the size of the option retrieved, or -1. 1899 */ 1900 int 1901 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 1902 { 1903 conn_t *connp = Q_TO_CONN(q); 1904 int err; 1905 1906 err = icmp_opt_get(connp, level, name, ptr); 1907 return (err); 1908 } 1909 1910 /* 1911 * This routine sets socket options. 1912 */ 1913 int 1914 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, 1915 uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) 1916 { 1917 conn_t *connp = coa->coa_connp; 1918 ip_xmit_attr_t *ixa = coa->coa_ixa; 1919 icmp_t *icmp = connp->conn_icmp; 1920 icmp_stack_t *is = icmp->icmp_is; 1921 int *i1 = (int *)invalp; 1922 boolean_t onoff = (*i1 == 0) ? 0 : 1; 1923 int error; 1924 1925 ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); 1926 1927 /* 1928 * For fixed length options, no sanity check 1929 * of passed in length is done. It is assumed *_optcom_req() 1930 * routines do the right thing. 
1931 */ 1932 1933 switch (level) { 1934 case SOL_SOCKET: 1935 switch (name) { 1936 case SO_PROTOTYPE: 1937 if ((*i1 & 0xFF) != IPPROTO_ICMP && 1938 (*i1 & 0xFF) != IPPROTO_ICMPV6 && 1939 secpolicy_net_rawaccess(cr) != 0) { 1940 return (EACCES); 1941 } 1942 if (checkonly) 1943 break; 1944 1945 mutex_enter(&connp->conn_lock); 1946 connp->conn_proto = *i1 & 0xFF; 1947 ixa->ixa_protocol = connp->conn_proto; 1948 if ((connp->conn_proto == IPPROTO_RAW || 1949 connp->conn_proto == IPPROTO_IGMP) && 1950 connp->conn_family == AF_INET) { 1951 icmp->icmp_hdrincl = 1; 1952 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 1953 } else if (connp->conn_proto == IPPROTO_UDP || 1954 connp->conn_proto == IPPROTO_TCP || 1955 connp->conn_proto == IPPROTO_SCTP) { 1956 /* Used by test applications like psh */ 1957 icmp->icmp_hdrincl = 0; 1958 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 1959 } else { 1960 icmp->icmp_hdrincl = 0; 1961 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 1962 } 1963 1964 if (connp->conn_family == AF_INET6 && 1965 connp->conn_proto == IPPROTO_ICMPV6) { 1966 /* Set offset for icmp6_cksum */ 1967 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 1968 ixa->ixa_raw_cksum_offset = 2; 1969 } 1970 if (icmp->icmp_filter != NULL && 1971 connp->conn_proto != IPPROTO_ICMPV6) { 1972 kmem_free(icmp->icmp_filter, 1973 sizeof (icmp6_filter_t)); 1974 icmp->icmp_filter = NULL; 1975 } 1976 mutex_exit(&connp->conn_lock); 1977 1978 coa->coa_changed |= COA_HEADER_CHANGED; 1979 /* 1980 * For SCTP, we don't use icmp_bind_proto() for 1981 * raw socket binding. 1982 */ 1983 if (connp->conn_proto == IPPROTO_SCTP) 1984 return (0); 1985 1986 coa->coa_changed |= COA_ICMP_BIND_NEEDED; 1987 return (0); 1988 1989 case SO_SNDBUF: 1990 if (*i1 > is->is_max_buf) { 1991 return (ENOBUFS); 1992 } 1993 break; 1994 case SO_RCVBUF: 1995 if (*i1 > is->is_max_buf) { 1996 return (ENOBUFS); 1997 } 1998 break; 1999 } 2000 break; 2001 2002 case IPPROTO_IP: 2003 /* 2004 * Only allow IPv4 option processing on IPv4 sockets. 
2005 */ 2006 if (connp->conn_family != AF_INET) 2007 return (EINVAL); 2008 2009 switch (name) { 2010 case IP_HDRINCL: 2011 if (!checkonly) { 2012 mutex_enter(&connp->conn_lock); 2013 icmp->icmp_hdrincl = onoff; 2014 if (onoff) 2015 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2016 else 2017 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2018 mutex_exit(&connp->conn_lock); 2019 } 2020 break; 2021 } 2022 break; 2023 2024 case IPPROTO_IPV6: 2025 if (connp->conn_family != AF_INET6) 2026 return (EINVAL); 2027 2028 switch (name) { 2029 case IPV6_CHECKSUM: 2030 /* 2031 * Integer offset into the user data of where the 2032 * checksum is located. 2033 * Offset of -1 disables option. 2034 * Does not apply to IPPROTO_ICMPV6. 2035 */ 2036 if (connp->conn_proto == IPPROTO_ICMPV6 || 2037 coa->coa_ancillary) { 2038 return (EINVAL); 2039 } 2040 if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { 2041 /* Negative or not 16 bit aligned offset */ 2042 return (EINVAL); 2043 } 2044 if (checkonly) 2045 break; 2046 2047 mutex_enter(&connp->conn_lock); 2048 if (*i1 == -1) { 2049 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; 2050 ixa->ixa_raw_cksum_offset = 0; 2051 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; 2052 } else { 2053 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; 2054 ixa->ixa_raw_cksum_offset = *i1; 2055 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; 2056 } 2057 mutex_exit(&connp->conn_lock); 2058 break; 2059 } 2060 break; 2061 2062 case IPPROTO_ICMPV6: 2063 /* 2064 * Only allow IPv6 option processing on IPv6 sockets. 
2065 */ 2066 if (connp->conn_family != AF_INET6) 2067 return (EINVAL); 2068 if (connp->conn_proto != IPPROTO_ICMPV6) 2069 return (EINVAL); 2070 2071 switch (name) { 2072 case ICMP6_FILTER: 2073 if (checkonly) 2074 break; 2075 2076 if ((inlen != 0) && 2077 (inlen != sizeof (icmp6_filter_t))) 2078 return (EINVAL); 2079 2080 mutex_enter(&connp->conn_lock); 2081 if (inlen == 0) { 2082 if (icmp->icmp_filter != NULL) { 2083 kmem_free(icmp->icmp_filter, 2084 sizeof (icmp6_filter_t)); 2085 icmp->icmp_filter = NULL; 2086 } 2087 } else { 2088 if (icmp->icmp_filter == NULL) { 2089 icmp->icmp_filter = kmem_alloc( 2090 sizeof (icmp6_filter_t), 2091 KM_NOSLEEP); 2092 if (icmp->icmp_filter == NULL) { 2093 mutex_exit(&connp->conn_lock); 2094 return (ENOBUFS); 2095 } 2096 } 2097 (void) bcopy(invalp, icmp->icmp_filter, inlen); 2098 } 2099 mutex_exit(&connp->conn_lock); 2100 break; 2101 } 2102 break; 2103 } 2104 error = conn_opt_set(coa, level, name, inlen, invalp, 2105 checkonly, cr); 2106 return (error); 2107 } 2108 2109 /* 2110 * This routine sets socket options. 2111 */ 2112 int 2113 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 2114 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2115 void *thisdg_attrs, cred_t *cr) 2116 { 2117 icmp_t *icmp = connp->conn_icmp; 2118 int err; 2119 conn_opt_arg_t coas, *coa; 2120 boolean_t checkonly; 2121 icmp_stack_t *is = icmp->icmp_is; 2122 2123 switch (optset_context) { 2124 case SETFN_OPTCOM_CHECKONLY: 2125 checkonly = B_TRUE; 2126 /* 2127 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 2128 * inlen != 0 implies value supplied and 2129 * we have to "pretend" to set it. 2130 * inlen == 0 implies that there is no 2131 * value part in T_CHECK request and just validation 2132 * done elsewhere should be enough, we just return here. 
2133 */ 2134 if (inlen == 0) { 2135 *outlenp = 0; 2136 return (0); 2137 } 2138 break; 2139 case SETFN_OPTCOM_NEGOTIATE: 2140 checkonly = B_FALSE; 2141 break; 2142 case SETFN_UD_NEGOTIATE: 2143 case SETFN_CONN_NEGOTIATE: 2144 checkonly = B_FALSE; 2145 /* 2146 * Negotiating local and "association-related" options 2147 * through T_UNITDATA_REQ. 2148 * 2149 * Following routine can filter out ones we do not 2150 * want to be "set" this way. 2151 */ 2152 if (!icmp_opt_allow_udr_set(level, name)) { 2153 *outlenp = 0; 2154 return (EINVAL); 2155 } 2156 break; 2157 default: 2158 /* 2159 * We should never get here 2160 */ 2161 *outlenp = 0; 2162 return (EINVAL); 2163 } 2164 2165 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 2166 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 2167 2168 if (thisdg_attrs != NULL) { 2169 /* Options from T_UNITDATA_REQ */ 2170 coa = (conn_opt_arg_t *)thisdg_attrs; 2171 ASSERT(coa->coa_connp == connp); 2172 ASSERT(coa->coa_ixa != NULL); 2173 ASSERT(coa->coa_ipp != NULL); 2174 ASSERT(coa->coa_ancillary); 2175 } else { 2176 coa = &coas; 2177 coas.coa_connp = connp; 2178 /* Get a reference on conn_ixa to prevent concurrent mods */ 2179 coas.coa_ixa = conn_get_ixa(connp, B_TRUE); 2180 if (coas.coa_ixa == NULL) { 2181 *outlenp = 0; 2182 return (ENOMEM); 2183 } 2184 coas.coa_ipp = &connp->conn_xmit_ipp; 2185 coas.coa_ancillary = B_FALSE; 2186 coas.coa_changed = 0; 2187 } 2188 2189 err = icmp_do_opt_set(coa, level, name, inlen, invalp, 2190 cr, checkonly); 2191 if (err != 0) { 2192 errout: 2193 if (!coa->coa_ancillary) 2194 ixa_refrele(coa->coa_ixa); 2195 *outlenp = 0; 2196 return (err); 2197 } 2198 2199 /* 2200 * Common case of OK return with outval same as inval. 
2201 */ 2202 if (invalp != outvalp) { 2203 /* don't trust bcopy for identical src/dst */ 2204 (void) bcopy(invalp, outvalp, inlen); 2205 } 2206 *outlenp = inlen; 2207 2208 /* 2209 * If this was not ancillary data, then we rebuild the headers, 2210 * update the IRE/NCE, and IPsec as needed. 2211 * Since the label depends on the destination we go through 2212 * ip_set_destination first. 2213 */ 2214 if (coa->coa_ancillary) { 2215 return (0); 2216 } 2217 2218 if (coa->coa_changed & COA_ROUTE_CHANGED) { 2219 in6_addr_t saddr, faddr, nexthop; 2220 in_port_t fport; 2221 2222 /* 2223 * We clear lastdst to make sure we pick up the change 2224 * next time sending. 2225 * If we are connected we re-cache the information. 2226 * We ignore errors to preserve BSD behavior. 2227 * Note that we don't redo IPsec policy lookup here 2228 * since the final destination (or source) didn't change. 2229 */ 2230 mutex_enter(&connp->conn_lock); 2231 connp->conn_v6lastdst = ipv6_all_zeros; 2232 2233 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, 2234 &connp->conn_faddr_v6, &nexthop); 2235 saddr = connp->conn_saddr_v6; 2236 faddr = connp->conn_faddr_v6; 2237 fport = connp->conn_fport; 2238 mutex_exit(&connp->conn_lock); 2239 2240 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && 2241 !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { 2242 (void) ip_attr_connect(connp, coa->coa_ixa, 2243 &saddr, &faddr, &nexthop, fport, NULL, NULL, 2244 IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 2245 } 2246 } 2247 2248 ixa_refrele(coa->coa_ixa); 2249 2250 if (coa->coa_changed & COA_HEADER_CHANGED) { 2251 /* 2252 * Rebuild the header template if we are connected. 2253 * Otherwise clear conn_v6lastdst so we rebuild the header 2254 * in the data path. 
2255 */ 2256 mutex_enter(&connp->conn_lock); 2257 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 2258 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 2259 err = icmp_build_hdr_template(connp, 2260 &connp->conn_saddr_v6, &connp->conn_faddr_v6, 2261 connp->conn_flowinfo); 2262 if (err != 0) { 2263 mutex_exit(&connp->conn_lock); 2264 return (err); 2265 } 2266 } else { 2267 connp->conn_v6lastdst = ipv6_all_zeros; 2268 } 2269 mutex_exit(&connp->conn_lock); 2270 } 2271 if (coa->coa_changed & COA_RCVBUF_CHANGED) { 2272 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 2273 connp->conn_rcvbuf); 2274 } 2275 if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 2276 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 2277 } 2278 if (coa->coa_changed & COA_WROFF_CHANGED) { 2279 /* Increase wroff if needed */ 2280 uint_t wroff; 2281 2282 mutex_enter(&connp->conn_lock); 2283 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; 2284 if (wroff > connp->conn_wroff) { 2285 connp->conn_wroff = wroff; 2286 mutex_exit(&connp->conn_lock); 2287 (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); 2288 } else { 2289 mutex_exit(&connp->conn_lock); 2290 } 2291 } 2292 if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { 2293 icmp_bind_proto(icmp); 2294 } 2295 return (err); 2296 } 2297 2298 /* This routine sets socket options. */ 2299 int 2300 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, 2301 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 2302 void *thisdg_attrs, cred_t *cr) 2303 { 2304 conn_t *connp = Q_TO_CONN(q); 2305 int error; 2306 2307 error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, 2308 outlenp, outvalp, thisdg_attrs, cr); 2309 return (error); 2310 } 2311 2312 /* 2313 * Setup IP headers. 2314 * 2315 * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, 2316 * but icmp_output_hdrincl restores ipha_protocol once we return. 
2317 */ 2318 mblk_t * 2319 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, 2320 const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, 2321 mblk_t *data_mp, int *errorp) 2322 { 2323 mblk_t *mp; 2324 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2325 uint_t data_len; 2326 uint32_t cksum; 2327 2328 data_len = msgdsize(data_mp); 2329 mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, 2330 flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); 2331 if (mp == NULL) { 2332 ASSERT(*errorp != 0); 2333 return (NULL); 2334 } 2335 2336 ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; 2337 2338 /* 2339 * If there was a routing option/header then conn_prepend_hdr 2340 * has massaged it and placed the pseudo-header checksum difference 2341 * in the cksum argument. 2342 * 2343 * Prepare for ICMPv6 checksum done in IP. 2344 * 2345 * We make it easy for IP to include our pseudo header 2346 * by putting our length (and any routing header adjustment) 2347 * in the ICMPv6 checksum field. 2348 * The IP source, destination, and length have already been set by 2349 * conn_prepend_hdr. 
2350 */ 2351 cksum += data_len; 2352 cksum = (cksum >> 16) + (cksum & 0xFFFF); 2353 ASSERT(cksum < 0x10000); 2354 2355 if (ixa->ixa_flags & IXAF_IS_IPV4) { 2356 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2357 2358 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); 2359 } else { 2360 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2361 uint_t cksum_offset = 0; 2362 2363 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); 2364 2365 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { 2366 if (connp->conn_proto == IPPROTO_ICMPV6) { 2367 cksum_offset = ixa->ixa_ip_hdr_length + 2368 offsetof(icmp6_t, icmp6_cksum); 2369 } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2370 cksum_offset = ixa->ixa_ip_hdr_length + 2371 ixa->ixa_raw_cksum_offset; 2372 } 2373 } 2374 if (cksum_offset != 0) { 2375 uint16_t *ptr; 2376 2377 /* Make sure the checksum fits in the first mblk */ 2378 if (cksum_offset + sizeof (short) > MBLKL(mp)) { 2379 mblk_t *mp1; 2380 2381 mp1 = msgpullup(mp, 2382 cksum_offset + sizeof (short)); 2383 freemsg(mp); 2384 if (mp1 == NULL) { 2385 *errorp = ENOMEM; 2386 return (NULL); 2387 } 2388 mp = mp1; 2389 ip6h = (ip6_t *)mp->b_rptr; 2390 } 2391 ptr = (uint16_t *)(mp->b_rptr + cksum_offset); 2392 *ptr = htons(cksum); 2393 } 2394 } 2395 2396 /* Note that we don't try to update wroff due to ancillary data */ 2397 return (mp); 2398 } 2399 2400 static int 2401 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, 2402 const in6_addr_t *v6dst, uint32_t flowinfo) 2403 { 2404 int error; 2405 2406 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2407 /* 2408 * We clear lastdst to make sure we don't use the lastdst path 2409 * next time sending since we might not have set v6dst yet. 2410 */ 2411 connp->conn_v6lastdst = ipv6_all_zeros; 2412 2413 error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); 2414 if (error != 0) 2415 return (error); 2416 2417 /* 2418 * Any routing header/option has been massaged. The checksum difference 2419 * is stored in conn_sum. 
2420 */ 2421 return (0); 2422 } 2423 2424 /* 2425 * This routine retrieves the value of an ND variable in a icmpparam_t 2426 * structure. It is called through nd_getset when a user reads the 2427 * variable. 2428 */ 2429 /* ARGSUSED */ 2430 static int 2431 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 2432 { 2433 icmpparam_t *icmppa = (icmpparam_t *)cp; 2434 2435 (void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value); 2436 return (0); 2437 } 2438 2439 /* 2440 * Walk through the param array specified registering each element with the 2441 * named dispatch (ND) handler. 2442 */ 2443 static boolean_t 2444 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt) 2445 { 2446 for (; cnt-- > 0; icmppa++) { 2447 if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) { 2448 if (!nd_load(ndp, icmppa->icmp_param_name, 2449 icmp_param_get, icmp_param_set, 2450 (caddr_t)icmppa)) { 2451 nd_free(ndp); 2452 return (B_FALSE); 2453 } 2454 } 2455 } 2456 return (B_TRUE); 2457 } 2458 2459 /* This routine sets an ND variable in a icmpparam_t structure. */ 2460 /* ARGSUSED */ 2461 static int 2462 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 2463 { 2464 long new_value; 2465 icmpparam_t *icmppa = (icmpparam_t *)cp; 2466 2467 /* 2468 * Fail the request if the new value does not lie within the 2469 * required bounds. 
2470 */ 2471 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 2472 new_value < icmppa->icmp_param_min || 2473 new_value > icmppa->icmp_param_max) { 2474 return (EINVAL); 2475 } 2476 /* Set the new value */ 2477 icmppa->icmp_param_value = new_value; 2478 return (0); 2479 } 2480 2481 static mblk_t * 2482 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) 2483 { 2484 ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); 2485 if (IPCL_IS_NONSTR(icmp->icmp_connp)) { 2486 /* 2487 * fallback has started but messages have not been moved yet 2488 */ 2489 if (icmp->icmp_fallback_queue_head == NULL) { 2490 ASSERT(icmp->icmp_fallback_queue_tail == NULL); 2491 icmp->icmp_fallback_queue_head = mp; 2492 icmp->icmp_fallback_queue_tail = mp; 2493 } else { 2494 ASSERT(icmp->icmp_fallback_queue_tail != NULL); 2495 icmp->icmp_fallback_queue_tail->b_next = mp; 2496 icmp->icmp_fallback_queue_tail = mp; 2497 } 2498 return (NULL); 2499 } else { 2500 /* 2501 * Fallback completed, let the caller putnext() the mblk. 2502 */ 2503 return (mp); 2504 } 2505 } 2506 2507 /* 2508 * Deliver data to ULP. In case we have a socket, and it's falling back to 2509 * TPI, then we'll queue the mp for later processing. 
2510 */ 2511 static void 2512 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) 2513 { 2514 if (IPCL_IS_NONSTR(connp)) { 2515 icmp_t *icmp = connp->conn_icmp; 2516 int error; 2517 2518 ASSERT(len == msgdsize(mp)); 2519 if ((*connp->conn_upcalls->su_recv) 2520 (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { 2521 mutex_enter(&icmp->icmp_recv_lock); 2522 if (error == ENOSPC) { 2523 /* 2524 * let's confirm while holding the lock 2525 */ 2526 if ((*connp->conn_upcalls->su_recv) 2527 (connp->conn_upper_handle, NULL, 0, 0, 2528 &error, NULL) < 0) { 2529 ASSERT(error == ENOSPC); 2530 if (error == ENOSPC) { 2531 connp->conn_flow_cntrld = 2532 B_TRUE; 2533 } 2534 } 2535 mutex_exit(&icmp->icmp_recv_lock); 2536 } else { 2537 ASSERT(error == EOPNOTSUPP); 2538 mp = icmp_queue_fallback(icmp, mp); 2539 mutex_exit(&icmp->icmp_recv_lock); 2540 if (mp != NULL) 2541 putnext(connp->conn_rq, mp); 2542 } 2543 } 2544 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); 2545 } else { 2546 putnext(connp->conn_rq, mp); 2547 } 2548 } 2549 2550 /* 2551 * This is the inbound data path. 2552 * IP has already pulled up the IP headers and verified alignment 2553 * etc. 
2554 */ 2555 /* ARGSUSED2 */ 2556 static void 2557 icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 2558 { 2559 conn_t *connp = (conn_t *)arg1; 2560 struct T_unitdata_ind *tudi; 2561 uchar_t *rptr; /* Pointer to IP header */ 2562 int ip_hdr_length; 2563 int udi_size; /* Size of T_unitdata_ind */ 2564 int pkt_len; 2565 icmp_t *icmp; 2566 ip_pkt_t ipps; 2567 ip6_t *ip6h; 2568 mblk_t *mp1; 2569 crb_t recv_ancillary; 2570 icmp_stack_t *is; 2571 sin_t *sin; 2572 sin6_t *sin6; 2573 ipha_t *ipha; 2574 2575 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2576 2577 icmp = connp->conn_icmp; 2578 is = icmp->icmp_is; 2579 rptr = mp->b_rptr; 2580 2581 ASSERT(DB_TYPE(mp) == M_DATA); 2582 ASSERT(OK_32PTR(rptr)); 2583 ASSERT(ira->ira_pktlen == msgdsize(mp)); 2584 pkt_len = ira->ira_pktlen; 2585 2586 /* 2587 * Get a snapshot of these and allow other threads to change 2588 * them after that. We need the same recv_ancillary when determining 2589 * the size as when adding the ancillary data items. 2590 */ 2591 mutex_enter(&connp->conn_lock); 2592 recv_ancillary = connp->conn_recv_ancillary; 2593 mutex_exit(&connp->conn_lock); 2594 2595 ip_hdr_length = ira->ira_ip_hdr_length; 2596 ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ 2597 2598 /* Initialize regardless of IP version */ 2599 ipps.ipp_fields = 0; 2600 2601 if (ira->ira_flags & IRAF_IS_IPV4) { 2602 ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); 2603 ASSERT(MBLKL(mp) >= sizeof (ipha_t)); 2604 ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); 2605 2606 ipha = (ipha_t *)mp->b_rptr; 2607 if (recv_ancillary.crb_all != 0) 2608 (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); 2609 2610 /* 2611 * BSD for some reason adjusts ipha_length to exclude the 2612 * IP header length. We do the same. 2613 */ 2614 if (is->is_bsd_compat) { 2615 ushort_t len; 2616 2617 len = ntohs(ipha->ipha_length); 2618 if (mp->b_datap->db_ref > 1) { 2619 /* 2620 * Allocate a new IP header so that we can 2621 * modify ipha_length. 
2622 */ 2623 mblk_t *mp1; 2624 2625 mp1 = allocb(ip_hdr_length, BPRI_MED); 2626 if (mp1 == NULL) { 2627 freemsg(mp); 2628 BUMP_MIB(&is->is_rawip_mib, 2629 rawipInErrors); 2630 return; 2631 } 2632 bcopy(rptr, mp1->b_rptr, ip_hdr_length); 2633 mp->b_rptr = rptr + ip_hdr_length; 2634 rptr = mp1->b_rptr; 2635 ipha = (ipha_t *)rptr; 2636 mp1->b_cont = mp; 2637 mp1->b_wptr = rptr + ip_hdr_length; 2638 mp = mp1; 2639 } 2640 len -= ip_hdr_length; 2641 ipha->ipha_length = htons(len); 2642 } 2643 2644 /* 2645 * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 2646 * sockets. This is ensured by icmp_bind and the IP fanout code. 2647 */ 2648 ASSERT(connp->conn_family == AF_INET); 2649 2650 /* 2651 * This is the inbound data path. Packets are passed upstream 2652 * as T_UNITDATA_IND messages with full IPv4 headers still 2653 * attached. 2654 */ 2655 2656 /* 2657 * Normally only send up the source address. 2658 * If any ancillary data items are wanted we add those. 2659 */ 2660 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); 2661 if (recv_ancillary.crb_all != 0) { 2662 udi_size += conn_recvancillary_size(connp, 2663 recv_ancillary, ira, mp, &ipps); 2664 } 2665 2666 /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ 2667 mp1 = allocb(udi_size, BPRI_MED); 2668 if (mp1 == NULL) { 2669 freemsg(mp); 2670 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2671 return; 2672 } 2673 mp1->b_cont = mp; 2674 tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2675 mp1->b_datap->db_type = M_PROTO; 2676 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2677 tudi->PRIM_type = T_UNITDATA_IND; 2678 tudi->SRC_length = sizeof (sin_t); 2679 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2680 sin = (sin_t *)&tudi[1]; 2681 *sin = sin_null; 2682 sin->sin_family = AF_INET; 2683 sin->sin_addr.s_addr = ipha->ipha_src; 2684 *(uint32_t *)&sin->sin_zero[0] = 0; 2685 *(uint32_t *)&sin->sin_zero[4] = 0; 2686 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + 2687 sizeof (sin_t); 2688 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); 2689 tudi->OPT_length = udi_size; 2690 2691 /* 2692 * Add options if IP_RECVIF etc is set 2693 */ 2694 if (udi_size != 0) { 2695 conn_recvancillary_add(connp, recv_ancillary, ira, 2696 &ipps, (uchar_t *)&sin[1], udi_size); 2697 } 2698 goto deliver; 2699 } 2700 2701 ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); 2702 /* 2703 * IPv6 packets can only be received by applications 2704 * that are prepared to receive IPv6 addresses. 2705 * The IP fanout must ensure this. 2706 */ 2707 ASSERT(connp->conn_family == AF_INET6); 2708 2709 /* 2710 * Handle IPv6 packets. We don't pass up the IP headers with the 2711 * payload for IPv6. 2712 */ 2713 2714 ip6h = (ip6_t *)rptr; 2715 if (recv_ancillary.crb_all != 0) { 2716 /* 2717 * Call on ip_find_hdr_v6 which gets individual lenghts of 2718 * extension headers (and pointers to them). 2719 */ 2720 uint8_t nexthdr; 2721 2722 /* We don't care about the length or nextheader. */ 2723 (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); 2724 2725 /* 2726 * We do not pass up hop-by-hop options or any other 2727 * extension header as part of the packet. Applications 2728 * that want to see them have to specify IPV6_RECV* socket 2729 * options. 
And conn_recvancillary_size/add explicitly 2730 * drops the TX option from IPV6_HOPOPTS as it does for UDP. 2731 * 2732 * If we had multilevel ICMP sockets, then we'd want to 2733 * modify conn_recvancillary_size/add to 2734 * allow the user to see the label. 2735 */ 2736 } 2737 2738 /* 2739 * Check a filter for ICMPv6 types if needed. 2740 * Verify raw checksums if needed. 2741 */ 2742 mutex_enter(&connp->conn_lock); 2743 if (icmp->icmp_filter != NULL) { 2744 int type; 2745 2746 /* Assumes that IP has done the pullupmsg */ 2747 type = mp->b_rptr[ip_hdr_length]; 2748 2749 ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); 2750 if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { 2751 mutex_exit(&connp->conn_lock); 2752 freemsg(mp); 2753 return; 2754 } 2755 } 2756 if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { 2757 /* Checksum */ 2758 uint16_t *up; 2759 uint32_t sum; 2760 int remlen; 2761 2762 up = (uint16_t *)&ip6h->ip6_src; 2763 2764 remlen = msgdsize(mp) - ip_hdr_length; 2765 sum = htons(connp->conn_proto + remlen) 2766 + up[0] + up[1] + up[2] + up[3] 2767 + up[4] + up[5] + up[6] + up[7] 2768 + up[8] + up[9] + up[10] + up[11] 2769 + up[12] + up[13] + up[14] + up[15]; 2770 sum = (sum & 0xffff) + (sum >> 16); 2771 sum = IP_CSUM(mp, ip_hdr_length, sum); 2772 if (sum != 0) { 2773 /* IPv6 RAW checksum failed */ 2774 ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); 2775 mutex_exit(&connp->conn_lock); 2776 freemsg(mp); 2777 BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); 2778 return; 2779 } 2780 } 2781 mutex_exit(&connp->conn_lock); 2782 2783 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2784 2785 if (recv_ancillary.crb_all != 0) { 2786 udi_size += conn_recvancillary_size(connp, 2787 recv_ancillary, ira, mp, &ipps); 2788 } 2789 2790 mp1 = allocb(udi_size, BPRI_MED); 2791 if (mp1 == NULL) { 2792 freemsg(mp); 2793 BUMP_MIB(&is->is_rawip_mib, rawipInErrors); 2794 return; 2795 } 2796 mp1->b_cont = mp; 2797 mp1->b_datap->db_type = M_PROTO; 2798 
tudi = (struct T_unitdata_ind *)mp1->b_rptr; 2799 mp1->b_wptr = (uchar_t *)tudi + udi_size; 2800 tudi->PRIM_type = T_UNITDATA_IND; 2801 tudi->SRC_length = sizeof (sin6_t); 2802 tudi->SRC_offset = sizeof (struct T_unitdata_ind); 2803 tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); 2804 udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); 2805 tudi->OPT_length = udi_size; 2806 sin6 = (sin6_t *)&tudi[1]; 2807 *sin6 = sin6_null; 2808 sin6->sin6_port = 0; 2809 sin6->sin6_family = AF_INET6; 2810 2811 sin6->sin6_addr = ip6h->ip6_src; 2812 /* No sin6_flowinfo per API */ 2813 sin6->sin6_flowinfo = 0; 2814 /* For link-scope pass up scope id */ 2815 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) 2816 sin6->sin6_scope_id = ira->ira_ruifindex; 2817 else 2818 sin6->sin6_scope_id = 0; 2819 sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 2820 IPCL_ZONEID(connp), is->is_netstack); 2821 2822 if (udi_size != 0) { 2823 conn_recvancillary_add(connp, recv_ancillary, ira, 2824 &ipps, (uchar_t *)&sin6[1], udi_size); 2825 } 2826 2827 /* Skip all the IPv6 headers per API */ 2828 mp->b_rptr += ip_hdr_length; 2829 pkt_len -= ip_hdr_length; 2830 2831 deliver: 2832 BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); 2833 icmp_ulp_recv(connp, mp1, pkt_len); 2834 } 2835 2836 /* 2837 * return SNMP stuff in buffer in mpdata. We don't hold any lock and report 2838 * information that can be changing beneath us. 
2839 */ 2840 mblk_t * 2841 icmp_snmp_get(queue_t *q, mblk_t *mpctl) 2842 { 2843 mblk_t *mpdata; 2844 struct opthdr *optp; 2845 conn_t *connp = Q_TO_CONN(q); 2846 icmp_stack_t *is = connp->conn_netstack->netstack_icmp; 2847 mblk_t *mp2ctl; 2848 2849 /* 2850 * make a copy of the original message 2851 */ 2852 mp2ctl = copymsg(mpctl); 2853 2854 if (mpctl == NULL || 2855 (mpdata = mpctl->b_cont) == NULL) { 2856 freemsg(mpctl); 2857 freemsg(mp2ctl); 2858 return (0); 2859 } 2860 2861 /* fixed length structure for IPv4 and IPv6 counters */ 2862 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 2863 optp->level = EXPER_RAWIP; 2864 optp->name = 0; 2865 (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib, 2866 sizeof (is->is_rawip_mib)); 2867 optp->len = msgdsize(mpdata); 2868 qreply(q, mpctl); 2869 2870 return (mp2ctl); 2871 } 2872 2873 /* 2874 * Return 0 if invalid set request, 1 otherwise, including non-rawip requests. 2875 * TODO: If this ever actually tries to set anything, it needs to be 2876 * to do the appropriate locking. 2877 */ 2878 /* ARGSUSED */ 2879 int 2880 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, 2881 uchar_t *ptr, int len) 2882 { 2883 switch (level) { 2884 case EXPER_RAWIP: 2885 return (0); 2886 default: 2887 return (1); 2888 } 2889 } 2890 2891 /* 2892 * This routine creates a T_UDERROR_IND message and passes it upstream. 2893 * The address and options are copied from the T_UNITDATA_REQ message 2894 * passed in mp. This message is freed. 
2895 */ 2896 static void 2897 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) 2898 { 2899 struct T_unitdata_req *tudr; 2900 mblk_t *mp1; 2901 uchar_t *destaddr; 2902 t_scalar_t destlen; 2903 uchar_t *optaddr; 2904 t_scalar_t optlen; 2905 2906 if ((mp->b_wptr < mp->b_rptr) || 2907 (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { 2908 goto done; 2909 } 2910 tudr = (struct T_unitdata_req *)mp->b_rptr; 2911 destaddr = mp->b_rptr + tudr->DEST_offset; 2912 if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || 2913 destaddr + tudr->DEST_length < mp->b_rptr || 2914 destaddr + tudr->DEST_length > mp->b_wptr) { 2915 goto done; 2916 } 2917 optaddr = mp->b_rptr + tudr->OPT_offset; 2918 if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || 2919 optaddr + tudr->OPT_length < mp->b_rptr || 2920 optaddr + tudr->OPT_length > mp->b_wptr) { 2921 goto done; 2922 } 2923 destlen = tudr->DEST_length; 2924 optlen = tudr->OPT_length; 2925 2926 mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, 2927 (char *)optaddr, optlen, err); 2928 if (mp1 != NULL) 2929 qreply(q, mp1); 2930 2931 done: 2932 freemsg(mp); 2933 } 2934 2935 static int 2936 rawip_do_unbind(conn_t *connp) 2937 { 2938 icmp_t *icmp = connp->conn_icmp; 2939 2940 mutex_enter(&connp->conn_lock); 2941 /* If a bind has not been done, we can't unbind. 
*/ 2942 if (icmp->icmp_state == TS_UNBND) { 2943 mutex_exit(&connp->conn_lock); 2944 return (-TOUTSTATE); 2945 } 2946 connp->conn_saddr_v6 = ipv6_all_zeros; 2947 connp->conn_bound_addr_v6 = ipv6_all_zeros; 2948 connp->conn_laddr_v6 = ipv6_all_zeros; 2949 connp->conn_mcbc_bind = B_FALSE; 2950 connp->conn_lport = 0; 2951 connp->conn_fport = 0; 2952 /* In case we were also connected */ 2953 connp->conn_faddr_v6 = ipv6_all_zeros; 2954 connp->conn_v6lastdst = ipv6_all_zeros; 2955 2956 icmp->icmp_state = TS_UNBND; 2957 2958 (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, 2959 &connp->conn_faddr_v6, connp->conn_flowinfo); 2960 mutex_exit(&connp->conn_lock); 2961 2962 ip_unbind(connp); 2963 return (0); 2964 } 2965 2966 /* 2967 * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. 2968 * After some error checking, the message is passed downstream to ip. 2969 */ 2970 static void 2971 icmp_tpi_unbind(queue_t *q, mblk_t *mp) 2972 { 2973 conn_t *connp = Q_TO_CONN(q); 2974 int error; 2975 2976 ASSERT(mp->b_cont == NULL); 2977 error = rawip_do_unbind(connp); 2978 if (error) { 2979 if (error < 0) { 2980 icmp_err_ack(q, mp, -error, 0); 2981 } else { 2982 icmp_err_ack(q, mp, 0, error); 2983 } 2984 return; 2985 } 2986 2987 /* 2988 * Convert mp into a T_OK_ACK 2989 */ 2990 2991 mp = mi_tpi_ok_ack_alloc(mp); 2992 2993 /* 2994 * should not happen in practice... T_OK_ACK is smaller than the 2995 * original message. 2996 */ 2997 ASSERT(mp != NULL); 2998 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); 2999 qreply(q, mp); 3000 } 3001 3002 /* 3003 * Process IPv4 packets that already include an IP header. 3004 * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and 3005 * IPPROTO_IGMP). 3006 * In this case we ignore the address and any options in the T_UNITDATA_REQ. 3007 * 3008 * The packet is assumed to have a base (20 byte) IP header followed 3009 * by the upper-layer protocol. 
We include any IP_OPTIONS including a 3010 * CIPSO label but otherwise preserve the base IP header. 3011 */ 3012 static int 3013 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3014 { 3015 icmp_t *icmp = connp->conn_icmp; 3016 icmp_stack_t *is = icmp->icmp_is; 3017 ipha_t iphas; 3018 ipha_t *ipha; 3019 int ip_hdr_length; 3020 int tp_hdr_len; 3021 ip_xmit_attr_t *ixa; 3022 ip_pkt_t *ipp; 3023 in6_addr_t v6src; 3024 in6_addr_t v6dst; 3025 in6_addr_t v6nexthop; 3026 int error; 3027 boolean_t do_ipsec; 3028 3029 /* 3030 * We need an exclusive copy of conn_ixa since the included IP 3031 * header could have any destination. 3032 * That copy has no pointers hence we 3033 * need to set them up once we've parsed the ancillary data. 3034 */ 3035 ixa = conn_get_ixa_exclusive(connp); 3036 if (ixa == NULL) { 3037 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3038 freemsg(mp); 3039 return (ENOMEM); 3040 } 3041 ASSERT(cr != NULL); 3042 /* 3043 * Caller has a reference on cr; from db_credp or because we 3044 * are running in process context. 
3045 */ 3046 ixa->ixa_cred = cr; 3047 ixa->ixa_cpid = pid; 3048 if (is_system_labeled()) { 3049 /* We need to restart with a label based on the cred */ 3050 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3051 } 3052 3053 /* In case previous destination was multicast or multirt */ 3054 ip_attr_newdst(ixa); 3055 3056 /* Get a copy of conn_xmit_ipp since the TX label might change it */ 3057 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3058 if (ipp == NULL) { 3059 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3060 ixa->ixa_cpid = connp->conn_cpid; 3061 ixa_refrele(ixa); 3062 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3063 freemsg(mp); 3064 return (ENOMEM); 3065 } 3066 mutex_enter(&connp->conn_lock); 3067 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3068 mutex_exit(&connp->conn_lock); 3069 if (error != 0) { 3070 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3071 freemsg(mp); 3072 goto done; 3073 } 3074 3075 /* Sanity check length of packet */ 3076 ipha = (ipha_t *)mp->b_rptr; 3077 3078 ip_hdr_length = IP_SIMPLE_HDR_LENGTH; 3079 if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { 3080 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 3081 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3082 freemsg(mp); 3083 goto done; 3084 } 3085 ipha = (ipha_t *)mp->b_rptr; 3086 } 3087 ipha->ipha_version_and_hdr_length = 3088 (IP_VERSION<<4) | (ip_hdr_length>>2); 3089 3090 /* 3091 * We set IXAF_DONTFRAG if the application set DF which makes 3092 * IP not fragment. 
3093 */ 3094 ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); 3095 if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) 3096 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3097 else 3098 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 3099 3100 /* Even for multicast and broadcast we honor the apps ttl */ 3101 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; 3102 3103 /* 3104 * No source verification for non-local addresses 3105 */ 3106 if (ipha->ipha_src != INADDR_ANY && 3107 ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, 3108 is->is_netstack->netstack_ip, B_FALSE) 3109 != IPVL_UNICAST_UP) { 3110 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3111 } 3112 3113 if (ipha->ipha_dst == INADDR_ANY) 3114 ipha->ipha_dst = htonl(INADDR_LOOPBACK); 3115 3116 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 3117 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 3118 3119 /* Defer IPsec if it might need to look at ICMP type/code */ 3120 do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; 3121 ixa->ixa_flags |= IXAF_IS_IPV4; 3122 3123 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3124 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, 3125 connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 3126 (do_ipsec ? IPDF_IPSEC : 0)); 3127 switch (error) { 3128 case 0: 3129 break; 3130 case EADDRNOTAVAIL: 3131 /* 3132 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3133 * Don't have the application see that errno 3134 */ 3135 error = ENETUNREACH; 3136 goto failed; 3137 case ENETDOWN: 3138 /* 3139 * Have !ipif_addr_ready address; drop packet silently 3140 * until we can get applications to not send until we 3141 * are ready. 3142 */ 3143 error = 0; 3144 goto failed; 3145 case EHOSTUNREACH: 3146 case ENETUNREACH: 3147 if (ixa->ixa_ire != NULL) { 3148 /* 3149 * Let conn_ip_output/ire_send_noroute return 3150 * the error and send any local ICMP error. 
3151 */ 3152 error = 0; 3153 break; 3154 } 3155 /* FALLTHRU */ 3156 default: 3157 failed: 3158 freemsg(mp); 3159 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3160 goto done; 3161 } 3162 if (ipha->ipha_src == INADDR_ANY) 3163 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); 3164 3165 /* 3166 * We might be going to a different destination than last time, 3167 * thus check that TX allows the communication and compute any 3168 * needed label. 3169 * 3170 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3171 * don't have to worry about concurrent threads. 3172 */ 3173 if (is_system_labeled()) { 3174 /* 3175 * Check whether Trusted Solaris policy allows communication 3176 * with this host, and pretend that the destination is 3177 * unreachable if not. 3178 * Compute any needed label and place it in ipp_label_v4/v6. 3179 * 3180 * Later conn_build_hdr_template/conn_prepend_hdr takes 3181 * ipp_label_v4/v6 to form the packet. 3182 * 3183 * Tsol note: We have ipp structure local to this thread so 3184 * no locking is needed. 3185 */ 3186 error = conn_update_label(connp, ixa, &v6dst, ipp); 3187 if (error != 0) { 3188 freemsg(mp); 3189 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3190 goto done; 3191 } 3192 } 3193 3194 /* 3195 * Save away a copy of the IPv4 header the application passed down 3196 * and then prepend an IPv4 header complete with any IP options 3197 * including label. 3198 * We need a struct copy since icmp_prepend_hdr will reuse the available 3199 * space in the mblk. 
3200 */ 3201 iphas = *ipha; 3202 mp->b_rptr += IP_SIMPLE_HDR_LENGTH; 3203 3204 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); 3205 if (mp == NULL) { 3206 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3207 ASSERT(error != 0); 3208 goto done; 3209 } 3210 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3211 error = EMSGSIZE; 3212 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3213 freemsg(mp); 3214 goto done; 3215 } 3216 /* Restore key parts of the header that the application passed down */ 3217 ipha = (ipha_t *)mp->b_rptr; 3218 ipha->ipha_type_of_service = iphas.ipha_type_of_service; 3219 ipha->ipha_ident = iphas.ipha_ident; 3220 ipha->ipha_fragment_offset_and_flags = 3221 iphas.ipha_fragment_offset_and_flags; 3222 ipha->ipha_ttl = iphas.ipha_ttl; 3223 ipha->ipha_protocol = iphas.ipha_protocol; 3224 ipha->ipha_src = iphas.ipha_src; 3225 ipha->ipha_dst = iphas.ipha_dst; 3226 3227 ixa->ixa_protocol = ipha->ipha_protocol; 3228 3229 /* 3230 * Make sure that the IP header plus any transport header that is 3231 * checksumed by ip_output is in the first mblk. (ip_output assumes 3232 * that at least the checksum field is in the first mblk.) 
3233 */ 3234 switch (ipha->ipha_protocol) { 3235 case IPPROTO_UDP: 3236 tp_hdr_len = 8; 3237 break; 3238 case IPPROTO_TCP: 3239 tp_hdr_len = 20; 3240 break; 3241 default: 3242 tp_hdr_len = 0; 3243 break; 3244 } 3245 ip_hdr_length = IPH_HDR_LENGTH(ipha); 3246 if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { 3247 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { 3248 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3249 if (mp->b_cont == NULL) 3250 error = EINVAL; 3251 else 3252 error = ENOMEM; 3253 freemsg(mp); 3254 goto done; 3255 } 3256 } 3257 3258 if (!do_ipsec) { 3259 /* Policy might differ for different ICMP type/code */ 3260 if (ixa->ixa_ipsec_policy != NULL) { 3261 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3262 ixa->ixa_ipsec_policy = NULL; 3263 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3264 } 3265 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); 3266 if (mp == NULL) { 3267 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3268 error = EHOSTUNREACH; /* IPsec policy failure */ 3269 goto done; 3270 } 3271 } 3272 3273 /* We're done. Pass the packet to ip. */ 3274 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3275 3276 error = conn_ip_output(mp, ixa); 3277 /* No rawipOutErrors if an error since IP increases its error counter */ 3278 switch (error) { 3279 case 0: 3280 break; 3281 case EWOULDBLOCK: 3282 (void) ixa_check_drain_insert(connp, ixa); 3283 error = 0; 3284 break; 3285 case EADDRNOTAVAIL: 3286 /* 3287 * IXAF_VERIFY_SOURCE tells us to pick a better source. 
3288 * Don't have the application see that errno 3289 */ 3290 error = ENETUNREACH; 3291 break; 3292 } 3293 done: 3294 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3295 ixa->ixa_cpid = connp->conn_cpid; 3296 ixa_refrele(ixa); 3297 ip_pkt_free(ipp); 3298 kmem_free(ipp, sizeof (*ipp)); 3299 return (error); 3300 } 3301 3302 static mblk_t * 3303 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) 3304 { 3305 ipha_t *ipha = NULL; 3306 ip6_t *ip6h = NULL; 3307 3308 if (ixa->ixa_flags & IXAF_IS_IPV4) 3309 ipha = (ipha_t *)mp->b_rptr; 3310 else 3311 ip6h = (ip6_t *)mp->b_rptr; 3312 3313 if (ixa->ixa_ipsec_policy != NULL) { 3314 IPPOL_REFRELE(ixa->ixa_ipsec_policy); 3315 ixa->ixa_ipsec_policy = NULL; 3316 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 3317 } 3318 return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); 3319 } 3320 3321 /* 3322 * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 3323 * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from 3324 * the TPI options, otherwise we take them from msg_control. 3325 * If both sin and sin6 is set it is a connected socket and we use conn_faddr. 3326 * Always consumes mp; never consumes tudr_mp. 3327 */ 3328 static int 3329 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, 3330 mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) 3331 { 3332 icmp_t *icmp = connp->conn_icmp; 3333 icmp_stack_t *is = icmp->icmp_is; 3334 int error; 3335 ip_xmit_attr_t *ixa; 3336 ip_pkt_t *ipp; 3337 in6_addr_t v6src; 3338 in6_addr_t v6dst; 3339 in6_addr_t v6nexthop; 3340 in_port_t dstport; 3341 uint32_t flowinfo; 3342 uint_t srcid; 3343 int is_absreq_failure = 0; 3344 conn_opt_arg_t coas, *coa; 3345 3346 ASSERT(tudr_mp != NULL || msg != NULL); 3347 3348 /* 3349 * Get ixa before checking state to handle a disconnect race. 3350 * 3351 * We need an exclusive copy of conn_ixa since the ancillary data 3352 * options might modify it. 
That copy has no pointers hence we 3353 * need to set them up once we've parsed the ancillary data. 3354 */ 3355 ixa = conn_get_ixa_exclusive(connp); 3356 if (ixa == NULL) { 3357 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3358 freemsg(mp); 3359 return (ENOMEM); 3360 } 3361 ASSERT(cr != NULL); 3362 ixa->ixa_cred = cr; 3363 ixa->ixa_cpid = pid; 3364 if (is_system_labeled()) { 3365 /* We need to restart with a label based on the cred */ 3366 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 3367 } 3368 3369 /* In case previous destination was multicast or multirt */ 3370 ip_attr_newdst(ixa); 3371 3372 /* Get a copy of conn_xmit_ipp since the options might change it */ 3373 ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); 3374 if (ipp == NULL) { 3375 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3376 ixa->ixa_cpid = connp->conn_cpid; 3377 ixa_refrele(ixa); 3378 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3379 freemsg(mp); 3380 return (ENOMEM); 3381 } 3382 mutex_enter(&connp->conn_lock); 3383 error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); 3384 mutex_exit(&connp->conn_lock); 3385 if (error != 0) { 3386 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3387 freemsg(mp); 3388 goto done; 3389 } 3390 3391 /* 3392 * Parse the options and update ixa and ipp as a result. 
3393 */ 3394 3395 coa = &coas; 3396 coa->coa_connp = connp; 3397 coa->coa_ixa = ixa; 3398 coa->coa_ipp = ipp; 3399 coa->coa_ancillary = B_TRUE; 3400 coa->coa_changed = 0; 3401 3402 if (msg != NULL) { 3403 error = process_auxiliary_options(connp, msg->msg_control, 3404 msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); 3405 } else { 3406 struct T_unitdata_req *tudr; 3407 3408 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 3409 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); 3410 error = tpi_optcom_buf(connp->conn_wq, tudr_mp, 3411 &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, 3412 coa, &is_absreq_failure); 3413 } 3414 if (error != 0) { 3415 /* 3416 * Note: No special action needed in this 3417 * module for "is_absreq_failure" 3418 */ 3419 freemsg(mp); 3420 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3421 goto done; 3422 } 3423 ASSERT(is_absreq_failure == 0); 3424 3425 mutex_enter(&connp->conn_lock); 3426 /* 3427 * If laddr is unspecified then we look at sin6_src_id. 3428 * We will give precedence to a source address set with IPV6_PKTINFO 3429 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 3430 * want ip_attr_connect to select a source (since it can fail) when 3431 * IPV6_PKTINFO is specified. 3432 * If this doesn't result in a source address then we get a source 3433 * from ip_attr_connect() below. 
3434 */ 3435 v6src = connp->conn_saddr_v6; 3436 if (sin != NULL) { 3437 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 3438 dstport = sin->sin_port; 3439 flowinfo = 0; 3440 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3441 ixa->ixa_flags |= IXAF_IS_IPV4; 3442 } else if (sin6 != NULL) { 3443 v6dst = sin6->sin6_addr; 3444 dstport = sin6->sin6_port; 3445 flowinfo = sin6->sin6_flowinfo; 3446 srcid = sin6->__sin6_src_id; 3447 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 3448 ixa->ixa_scopeid = sin6->sin6_scope_id; 3449 ixa->ixa_flags |= IXAF_SCOPEID_SET; 3450 } else { 3451 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 3452 } 3453 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 3454 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 3455 connp->conn_netstack); 3456 } 3457 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 3458 ixa->ixa_flags |= IXAF_IS_IPV4; 3459 else 3460 ixa->ixa_flags &= ~IXAF_IS_IPV4; 3461 } else { 3462 /* Connected case */ 3463 v6dst = connp->conn_faddr_v6; 3464 flowinfo = connp->conn_flowinfo; 3465 } 3466 mutex_exit(&connp->conn_lock); 3467 /* Handle IPV6_PKTINFO setting source address. 
*/ 3468 if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && 3469 (ipp->ipp_fields & IPPF_ADDR)) { 3470 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3471 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3472 v6src = ipp->ipp_addr; 3473 } else { 3474 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 3475 v6src = ipp->ipp_addr; 3476 } 3477 } 3478 /* 3479 * Allow source not assigned to the system 3480 * only if it is not a local addresses 3481 */ 3482 if (!V6_OR_V4_INADDR_ANY(v6src)) { 3483 ip_laddr_t laddr_type; 3484 3485 if (ixa->ixa_flags & IXAF_IS_IPV4) { 3486 ipaddr_t v4src; 3487 3488 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 3489 laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid, 3490 is->is_netstack->netstack_ip, B_FALSE); 3491 } else { 3492 laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid, 3493 is->is_netstack->netstack_ip, B_FALSE, B_FALSE); 3494 } 3495 if (laddr_type != IPVL_UNICAST_UP) 3496 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; 3497 } 3498 3499 ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); 3500 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 3501 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); 3502 3503 switch (error) { 3504 case 0: 3505 break; 3506 case EADDRNOTAVAIL: 3507 /* 3508 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3509 * Don't have the application see that errno 3510 */ 3511 error = ENETUNREACH; 3512 goto failed; 3513 case ENETDOWN: 3514 /* 3515 * Have !ipif_addr_ready address; drop packet silently 3516 * until we can get applications to not send until we 3517 * are ready. 3518 */ 3519 error = 0; 3520 goto failed; 3521 case EHOSTUNREACH: 3522 case ENETUNREACH: 3523 if (ixa->ixa_ire != NULL) { 3524 /* 3525 * Let conn_ip_output/ire_send_noroute return 3526 * the error and send any local ICMP error. 
3527 */ 3528 error = 0; 3529 break; 3530 } 3531 /* FALLTHRU */ 3532 default: 3533 failed: 3534 freemsg(mp); 3535 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3536 goto done; 3537 } 3538 3539 /* 3540 * We might be going to a different destination than last time, 3541 * thus check that TX allows the communication and compute any 3542 * needed label. 3543 * 3544 * TSOL Note: We have an exclusive ipp and ixa for this thread so we 3545 * don't have to worry about concurrent threads. 3546 */ 3547 if (is_system_labeled()) { 3548 /* 3549 * Check whether Trusted Solaris policy allows communication 3550 * with this host, and pretend that the destination is 3551 * unreachable if not. 3552 * Compute any needed label and place it in ipp_label_v4/v6. 3553 * 3554 * Later conn_build_hdr_template/conn_prepend_hdr takes 3555 * ipp_label_v4/v6 to form the packet. 3556 * 3557 * Tsol note: We have ipp structure local to this thread so 3558 * no locking is needed. 3559 */ 3560 error = conn_update_label(connp, ixa, &v6dst, ipp); 3561 if (error != 0) { 3562 freemsg(mp); 3563 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3564 goto done; 3565 } 3566 } 3567 mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, 3568 &error); 3569 if (mp == NULL) { 3570 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3571 ASSERT(error != 0); 3572 goto done; 3573 } 3574 if (ixa->ixa_pktlen > IP_MAXPACKET) { 3575 error = EMSGSIZE; 3576 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3577 freemsg(mp); 3578 goto done; 3579 } 3580 3581 /* Policy might differ for different ICMP type/code */ 3582 mp = icmp_output_attach_policy(mp, connp, ixa); 3583 if (mp == NULL) { 3584 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 3585 error = EHOSTUNREACH; /* IPsec policy failure */ 3586 goto done; 3587 } 3588 3589 /* We're done. Pass the packet to ip. 
*/ 3590 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 3591 3592 error = conn_ip_output(mp, ixa); 3593 if (!connp->conn_unspec_src) 3594 ixa->ixa_flags |= IXAF_VERIFY_SOURCE; 3595 /* No rawipOutErrors if an error since IP increases its error counter */ 3596 switch (error) { 3597 case 0: 3598 break; 3599 case EWOULDBLOCK: 3600 (void) ixa_check_drain_insert(connp, ixa); 3601 error = 0; 3602 break; 3603 case EADDRNOTAVAIL: 3604 /* 3605 * IXAF_VERIFY_SOURCE tells us to pick a better source. 3606 * Don't have the application see that errno 3607 */ 3608 error = ENETUNREACH; 3609 /* FALLTHRU */ 3610 default: 3611 mutex_enter(&connp->conn_lock); 3612 /* 3613 * Clear the source and v6lastdst so we call ip_attr_connect 3614 * for the next packet and try to pick a better source. 3615 */ 3616 if (connp->conn_mcbc_bind) 3617 connp->conn_saddr_v6 = ipv6_all_zeros; 3618 else 3619 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 3620 connp->conn_v6lastdst = ipv6_all_zeros; 3621 mutex_exit(&connp->conn_lock); 3622 break; 3623 } 3624 done: 3625 ixa->ixa_cred = connp->conn_cred; /* Restore */ 3626 ixa->ixa_cpid = connp->conn_cpid; 3627 ixa_refrele(ixa); 3628 ip_pkt_free(ipp); 3629 kmem_free(ipp, sizeof (*ipp)); 3630 return (error); 3631 } 3632 3633 /* 3634 * Handle sending an M_DATA for a connected socket. 3635 * Handles both IPv4 and IPv6. 3636 */ 3637 int 3638 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) 3639 { 3640 icmp_t *icmp = connp->conn_icmp; 3641 icmp_stack_t *is = icmp->icmp_is; 3642 int error; 3643 ip_xmit_attr_t *ixa; 3644 boolean_t do_ipsec; 3645 3646 /* 3647 * If no other thread is using conn_ixa this just gets a reference to 3648 * conn_ixa. Otherwise we get a safe copy of conn_ixa. 
	 */
	ixa = conn_get_ixa(connp, B_FALSE);
	if (ixa == NULL) {
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		return (ENOMEM);
	}

	/* Send with the caller's cred/pid; restored before ixa is released */
	ASSERT(cr != NULL);
	ixa->ixa_cred = cr;
	ixa->ixa_cpid = pid;

	/* Defer IPsec if it might need to look at ICMP type/code */
	switch (ixa->ixa_protocol) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		do_ipsec = B_FALSE;
		break;
	default:
		do_ipsec = B_TRUE;
	}

	/* icmp_prepend_header_template requires conn_lock to be held */
	mutex_enter(&connp->conn_lock);
	mp = icmp_prepend_header_template(connp, ixa, mp,
	    &connp->conn_saddr_v6, connp->conn_flowinfo, &error);

	if (mp == NULL) {
		ASSERT(error != 0);
		mutex_exit(&connp->conn_lock);
		ixa->ixa_cred = connp->conn_cred;	/* Restore */
		ixa->ixa_cpid = connp->conn_cpid;
		ixa_refrele(ixa);
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		/* mp is NULL on this path, so nothing is left to free here */
		freemsg(mp);
		return (error);
	}

	if (!do_ipsec) {
		/* Policy might differ for different ICMP type/code */
		mp = icmp_output_attach_policy(mp, connp, ixa);
		if (mp == NULL) {
			mutex_exit(&connp->conn_lock);
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			ixa->ixa_cred = connp->conn_cred;	/* Restore */
			ixa->ixa_cpid = connp->conn_cpid;
			ixa_refrele(ixa);
			return (EHOSTUNREACH);	/* IPsec policy failure */
		}
	}

	/*
	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
	 * safe copy, then we need to fill in any pointers in it.
	 */
	if (ixa->ixa_ire == NULL) {
		in6_addr_t	faddr, saddr;
		in6_addr_t	nexthop;
		in_port_t	fport;

		/* Snapshot the conn_t fields while conn_lock is still held */
		saddr = connp->conn_saddr_v6;
		faddr = connp->conn_faddr_v6;
		fport = connp->conn_fport;
		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
		mutex_exit(&connp->conn_lock);

		error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
		    fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
		    (do_ipsec ? IPDF_IPSEC : 0));
		switch (error) {
		case 0:
			break;
		case EADDRNOTAVAIL:
			/*
			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
			 * Don't have the application see that errno
			 */
			error = ENETUNREACH;
			goto failed;
		case ENETDOWN:
			/*
			 * Have !ipif_addr_ready address; drop packet silently
			 * until we can get applications to not send until we
			 * are ready.
			 */
			error = 0;
			goto failed;
		case EHOSTUNREACH:
		case ENETUNREACH:
			if (ixa->ixa_ire != NULL) {
				/*
				 * Let conn_ip_output/ire_send_noroute return
				 * the error and send any local ICMP error.
				 */
				error = 0;
				break;
			}
			/* FALLTHRU */
		default:
		failed:
			/* Drop the packet; error may be 0 (silent drop) */
			ixa->ixa_cred = connp->conn_cred;	/* Restore */
			ixa->ixa_cpid = connp->conn_cpid;
			ixa_refrele(ixa);
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			freemsg(mp);
			return (error);
		}
	} else {
		/* Done with conn_t */
		mutex_exit(&connp->conn_lock);
	}

	/* We're done.  Pass the packet to ip. */
	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);

	error = conn_ip_output(mp, ixa);
	/* No rawipOutErrors if an error since IP increases its error counter */
	switch (error) {
	case 0:
		break;
	case EWOULDBLOCK:
		(void) ixa_check_drain_insert(connp, ixa);
		error = 0;
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		break;
	}
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);
	return (error);
}

/*
 * Handle sending an M_DATA to the last destination.
 * Handles both IPv4 and IPv6.
 *
 * The source, destination, port and flow label are taken from
 * conn_v6lastsrc, conn_v6lastdst, conn_lastdstport and conn_lastflowinfo.
 * cr and pid are installed in ixa for the duration of the send and
 * replaced by conn_cred/conn_cpid before returning.
 *
 * The reference on ixa passed in by the caller is released on every
 * return path. Returns 0 or an errno value.
 *
 * NOTE: The caller must hold conn_lock and we drop it here.
 */
int
icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
    ip_xmit_attr_t *ixa)
{
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	int		error;
	boolean_t	do_ipsec;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT(ixa != NULL);

	ASSERT(cr != NULL);
	ixa->ixa_cred = cr;
	ixa->ixa_cpid = pid;

	/* Defer IPsec if it might need to look at ICMP type/code */
	switch (ixa->ixa_protocol) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		do_ipsec = B_FALSE;
		break;
	default:
		do_ipsec = B_TRUE;
	}


	mp = icmp_prepend_header_template(connp, ixa, mp,
	    &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);

	if (mp == NULL) {
		ASSERT(error != 0);
		mutex_exit(&connp->conn_lock);
		ixa->ixa_cred = connp->conn_cred;	/* Restore */
		ixa->ixa_cpid = connp->conn_cpid;
		ixa_refrele(ixa);
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		/* mp is NULL on this path, so nothing is left to free here */
		freemsg(mp);
		return (error);
	}

	if (!do_ipsec) {
		/* Policy might differ for different ICMP type/code */
		mp = icmp_output_attach_policy(mp, connp, ixa);
		if (mp == NULL) {
			mutex_exit(&connp->conn_lock);
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			ixa->ixa_cred = connp->conn_cred;	/* Restore */
			ixa->ixa_cpid = connp->conn_cpid;
			ixa_refrele(ixa);
			return (EHOSTUNREACH);	/* IPsec policy failure */
		}
	}

	/*
	 * In case we got a safe copy of conn_ixa, or if opt_set made us a new
	 * safe copy, then we need to fill in any pointers in it.
	 */
	if (ixa->ixa_ire == NULL) {
		in6_addr_t	lastdst, lastsrc;
		in6_addr_t	nexthop;
		in_port_t	lastport;

		/* Snapshot the conn_t fields while conn_lock is still held */
		lastsrc = connp->conn_v6lastsrc;
		lastdst = connp->conn_v6lastdst;
		lastport = connp->conn_lastdstport;
		ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
		mutex_exit(&connp->conn_lock);

		error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
		    &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
		    IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
		switch (error) {
		case 0:
			break;
		case EADDRNOTAVAIL:
			/*
			 * IXAF_VERIFY_SOURCE tells us to pick a better source.
			 * Don't have the application see that errno
			 */
			error = ENETUNREACH;
			goto failed;
		case ENETDOWN:
			/*
			 * Have !ipif_addr_ready address; drop packet silently
			 * until we can get applications to not send until we
			 * are ready.
			 */
			error = 0;
			goto failed;
		case EHOSTUNREACH:
		case ENETUNREACH:
			if (ixa->ixa_ire != NULL) {
				/*
				 * Let conn_ip_output/ire_send_noroute return
				 * the error and send any local ICMP error.
				 */
				error = 0;
				break;
			}
			/* FALLTHRU */
		default:
		failed:
			/* Drop the packet; error may be 0 (silent drop) */
			ixa->ixa_cred = connp->conn_cred;	/* Restore */
			ixa->ixa_cpid = connp->conn_cpid;
			ixa_refrele(ixa);
			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
			freemsg(mp);
			return (error);
		}
	} else {
		/* Done with conn_t */
		mutex_exit(&connp->conn_lock);
	}

	/* We're done.  Pass the packet to ip. */
	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
	error = conn_ip_output(mp, ixa);
	/* No rawipOutErrors if an error since IP increases its error counter */
	switch (error) {
	case 0:
		break;
	case EWOULDBLOCK:
		(void) ixa_check_drain_insert(connp, ixa);
		error = 0;
		break;
	case EADDRNOTAVAIL:
		/*
		 * IXAF_VERIFY_SOURCE tells us to pick a better source.
		 * Don't have the application see that errno
		 */
		error = ENETUNREACH;
		/* FALLTHRU */
	default:
		mutex_enter(&connp->conn_lock);
		/*
		 * Clear the source and v6lastdst so we call ip_attr_connect
		 * for the next packet and try to pick a better source.
		 */
		if (connp->conn_mcbc_bind)
			connp->conn_saddr_v6 = ipv6_all_zeros;
		else
			connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
		connp->conn_v6lastdst = ipv6_all_zeros;
		mutex_exit(&connp->conn_lock);
		break;
	}
	ixa->ixa_cred = connp->conn_cred;	/* Restore */
	ixa->ixa_cpid = connp->conn_cpid;
	ixa_refrele(ixa);
	return (error);
}


/*
 * Prepend the header template and then fill in the source and
 * flowinfo. The caller needs to handle the destination address since
 * its setting is different if rthdr or source route.
 *
 * The caller must hold conn_lock. On success ixa_pktlen and
 * ixa_ip_hdr_length have been set in ixa.
 *
 * Returns NULL if allocation failed or if the packet would exceed
 * IP_MAXPACKET. When it returns NULL it sets errorp and mp has been freed.
 */
static mblk_t *
icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
    const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
{
	icmp_t		*icmp = connp->conn_icmp;
	icmp_stack_t	*is = icmp->icmp_is;
	uint_t		pktlen;
	uint_t		copylen;
	uint8_t		*iph;
	uint_t		ip_hdr_length;
	uint32_t	cksum;
	ip_pkt_t	*ipp;

	ASSERT(MUTEX_HELD(&connp->conn_lock));

	/*
	 * Copy the header template.
	 */
	copylen = connp->conn_ht_iphc_len;
	pktlen = copylen + msgdsize(mp);
	if (pktlen > IP_MAXPACKET) {
		freemsg(mp);
		*errorp = EMSGSIZE;
		return (NULL);
	}
	ixa->ixa_pktlen = pktlen;

	/* check/fix buffer config, setup pointers into it */
	iph = mp->b_rptr - copylen;
	if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
		mblk_t *mp1;

		/*
		 * The header can't be written in front of the data in place
		 * (shared dblk, not enough headroom, or misaligned), so put
		 * it at the end of a new mblk linked in front of mp.
		 */
		mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
		if (mp1 == NULL) {
			freemsg(mp);
			*errorp = ENOMEM;
			return (NULL);
		}
		mp1->b_wptr = DB_LIM(mp1);
		mp1->b_cont = mp;
		mp = mp1;
		iph = (mp->b_wptr - copylen);
	}
	mp->b_rptr = iph;
	bcopy(connp->conn_ht_iphc, iph, copylen);
	/* The IP header length is the offset of the ULP part of the template */
	ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);

	ixa->ixa_ip_hdr_length = ip_hdr_length;

	/*
	 * Prepare for ICMPv6 checksum done in IP.
	 *
	 * icmp_build_hdr_template has already massaged any routing header
	 * and placed the result in conn_sum.
	 *
	 * We make it easy for IP to include our pseudo header
	 * by putting our length (and any routing header adjustment)
	 * in the ICMPv6 checksum field.
	 */
	cksum = pktlen - ip_hdr_length;

	cksum += connp->conn_sum;
	/* Fold the sum into 16 bits */
	cksum = (cksum >> 16) + (cksum & 0xFFFF);
	ASSERT(cksum < 0x10000);

	ipp = &connp->conn_xmit_ipp;
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		ipha_t	*ipha = (ipha_t *)iph;

		ipha->ipha_length = htons((uint16_t)pktlen);

		/* if IP_PKTINFO specified an address it wins over bind() */
		if ((ipp->ipp_fields & IPPF_ADDR) &&
		    IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
			ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
			ipha->ipha_src = ipp->ipp_addr_v4;
		} else {
			IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
		}
	} else {
		ip6_t *ip6h = (ip6_t *)iph;
		uint_t	cksum_offset = 0;

		ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));

		/* if IP_PKTINFO specified an address it wins over bind() */
		if ((ipp->ipp_fields & IPPF_ADDR) &&
		    !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
			ip6h->ip6_src = ipp->ipp_addr;
		} else {
			ip6h->ip6_src = *v6src;
		}
		ip6h->ip6_vcf =
		    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
		    (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
		if (ipp->ipp_fields & IPPF_TCLASS) {
			/* Overrides the class part of flowinfo */
			ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
			    ipp->ipp_tclass);
		}

		/*
		 * Locate the checksum field: fixed for ICMPv6, otherwise
		 * at the offset recorded in ixa_raw_cksum_offset. A
		 * cksum_offset of zero means there is nothing to fill in.
		 */
		if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
			if (connp->conn_proto == IPPROTO_ICMPV6) {
				cksum_offset = ixa->ixa_ip_hdr_length +
				    offsetof(icmp6_t, icmp6_cksum);
			} else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
				cksum_offset = ixa->ixa_ip_hdr_length +
				    ixa->ixa_raw_cksum_offset;
			}
		}
		if (cksum_offset != 0) {
			uint16_t *ptr;

			/* Make sure the checksum fits in the first mblk */
			if (cksum_offset + sizeof (short) > MBLKL(mp)) {
				mblk_t *mp1;

				mp1 = msgpullup(mp,
				    cksum_offset + sizeof (short));
				/* The original chain is freed either way */
				freemsg(mp);
				if (mp1 == NULL) {
					*errorp = ENOMEM;
					return (NULL);
				}
				mp = mp1;
				iph = mp->b_rptr;
				ip6h = (ip6_t *)iph;
			}
			ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
			*ptr = htons(cksum);
		}
	}

	return (mp);
}

/*
 * This routine handles all messages passed downstream.  It either
 * consumes the message or passes it downstream; it never queues
 * a message.
 *
 * Only M_PROTO/M_PCPROTO messages carrying a complete T_UNITDATA_REQ are
 * handled here; M_DATA is dropped and counted as an output error, and
 * everything else is handed to icmp_wput_other(). For a T_UNITDATA_REQ the
 * destination address is validated against the conn's address family and
 * the data in b_cont is sent through one of the icmp_output_*() routines.
 * Failures are reported upstream through icmp_ud_err().
 */
void
icmp_wput(queue_t *q, mblk_t *mp)
{
	sin6_t		*sin6;
	sin_t		*sin = NULL;
	uint_t		srcid;
	conn_t		*connp = Q_TO_CONN(q);
	icmp_t		*icmp = connp->conn_icmp;
	int		error = 0;
	struct sockaddr	*addr = NULL;
	socklen_t	addrlen;
	icmp_stack_t	*is = icmp->icmp_is;
	struct T_unitdata_req *tudr;
	mblk_t		*data_mp;
	cred_t		*cr;
	pid_t		pid;

	/*
	 * We directly handle several cases here: T_UNITDATA_REQ message
	 * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
	 * socket.
	 */
	switch (DB_TYPE(mp)) {
	case M_DATA:
		/* sockfs never sends down M_DATA */
		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
		freemsg(mp);
		return;

	case M_PROTO:
	case M_PCPROTO:
		tudr = (struct T_unitdata_req *)mp->b_rptr;
		/* Too short for, or not, a T_UNITDATA_REQ: not handled here */
		if (MBLKL(mp) < sizeof (*tudr) ||
		    ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
			icmp_wput_other(q, mp);
			return;
		}
		break;

	default:
		icmp_wput_other(q, mp);
		return;
	}

	/* Handle valid T_UNITDATA_REQ here */
	data_mp = mp->b_cont;
	if (data_mp == NULL) {
		error = EPROTO;
		goto ud_error2;
	}
	/* From here on the TPI header (mp) and data_mp are separate */
	mp->b_cont = NULL;

	if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
		error = EADDRNOTAVAIL;
		goto ud_error2;
	}

	/*
	 * All Solaris components should pass a db_credp
	 * for this message, hence we ASSERT.
	 * On production kernels we return an error to be robust against
	 * random streams modules sitting on top of us.
	 */
	cr = msg_getcred(mp, &pid);
	ASSERT(cr != NULL);
	if (cr == NULL) {
		error = EINVAL;
		goto ud_error2;
	}

	/*
	 * If a port has not been bound to the stream, fail.
	 * This is not a problem when sockfs is directly
	 * above us, because it will ensure that the socket
	 * is first bound before allowing data to be sent.
	 */
	if (icmp->icmp_state == TS_UNBND) {
		error = EPROTO;
		goto ud_error2;
	}
	addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
	addrlen = tudr->DEST_length;

	switch (connp->conn_family) {
	case AF_INET6:
		sin6 = (sin6_t *)addr;
		if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
		    (sin6->sin6_family != AF_INET6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* No support for mapped addresses on raw sockets */
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		srcid = sin6->__sin6_src_id;

		/*
		 * If the local address is a mapped address return
		 * an error.
		 * It would be possible to send an IPv6 packet but the
		 * response would never make it back to the application
		 * since it is bound to a mapped address.
		 */
		if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}

		/* An unspecified destination is sent to the loopback address */
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			sin6->sin6_addr = ipv6_loopback;

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v6(connp, sin6)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, NULL, sin6,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v6 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v6(connp, sin6) &&
			    connp->conn_lastsrcid == srcid &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, NULL,
				    sin6, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Only the TPI header is left to free */
			freeb(mp);
			return;
		}
		break;

	case AF_INET:
		sin = (sin_t *)addr;
		if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
		    (sin->sin_family != AF_INET)) {
			error = EADDRNOTAVAIL;
			goto ud_error2;
		}
		/* An unspecified destination is sent to the loopback address */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		/* Protocol 255 contains full IP headers */
		/* Read without holding lock */
		if (icmp->icmp_hdrincl) {
			if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
				if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
					error = EINVAL;
					goto ud_error2;
				}
			}
			error = icmp_output_hdrincl(connp, data_mp, cr, pid);
			if (error == 0) {
				freeb(mp);
				return;
			}
			/* data_mp consumed above */
			data_mp = NULL;
			goto ud_error2;
		}

		if (tudr->OPT_length != 0) {
			/*
			 * If we are connected then the destination needs to be
			 * the same as the connected one.
			 */
			if (icmp->icmp_state == TS_DATA_XFER &&
			    !conn_same_as_last_v4(connp, sin)) {
				error = EISCONN;
				goto ud_error2;
			}
			error = icmp_output_ancillary(connp, sin, NULL,
			    data_mp, mp, NULL, cr, pid);
		} else {
			ip_xmit_attr_t *ixa;

			/*
			 * We have to allocate an ip_xmit_attr_t before we grab
			 * conn_lock and we need to hold conn_lock once we've
			 * checked conn_same_as_last_v4 to handle concurrent
			 * send* calls on a socket.
			 */
			ixa = conn_get_ixa(connp, B_FALSE);
			if (ixa == NULL) {
				error = ENOMEM;
				goto ud_error2;
			}
			mutex_enter(&connp->conn_lock);

			if (conn_same_as_last_v4(connp, sin) &&
			    ipsec_outbound_policy_current(ixa)) {
				/* icmp_output_lastdst drops conn_lock */
				error = icmp_output_lastdst(connp, data_mp, cr,
				    pid, ixa);
			} else {
				/* icmp_output_newdst drops conn_lock */
				error = icmp_output_newdst(connp, data_mp, sin,
				    NULL, cr, pid, ixa);
			}
			ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
		}
		if (error == 0) {
			/* Only the TPI header is left to free */
			freeb(mp);
			return;
		}
		break;
	}
	/*
	 * An output routine failed. It was handed data_mp, so only the
	 * TPI header is turned into an error indication here.
	 */
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
	return;

ud_error2:
	/* Failure before (or instead of) handing data_mp to an output routine */
	BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
	freemsg(data_mp);
	ASSERT(mp != NULL);
	/* mp is freed by the following routine */
	icmp_ud_err(q, mp, (t_scalar_t)error);
}

/*
 * Handle the case of the IP address or flow label being different
 * for both IPv4 and IPv6.
 *
 * NOTE: The caller must hold conn_lock and we drop it here.
4350 */ 4351 static int 4352 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, 4353 cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) 4354 { 4355 icmp_t *icmp = connp->conn_icmp; 4356 icmp_stack_t *is = icmp->icmp_is; 4357 int error; 4358 ip_xmit_attr_t *oldixa; 4359 boolean_t do_ipsec; 4360 uint_t srcid; 4361 uint32_t flowinfo; 4362 in6_addr_t v6src; 4363 in6_addr_t v6dst; 4364 in6_addr_t v6nexthop; 4365 in_port_t dstport; 4366 4367 ASSERT(MUTEX_HELD(&connp->conn_lock)); 4368 ASSERT(ixa != NULL); 4369 4370 /* 4371 * We hold conn_lock across all the use and modifications of 4372 * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they 4373 * stay consistent. 4374 */ 4375 4376 ASSERT(cr != NULL); 4377 ixa->ixa_cred = cr; 4378 ixa->ixa_cpid = pid; 4379 if (is_system_labeled()) { 4380 /* We need to restart with a label based on the cred */ 4381 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 4382 } 4383 /* 4384 * If we are connected then the destination needs to be the 4385 * same as the connected one, which is not the case here since we 4386 * checked for that above. 4387 */ 4388 if (icmp->icmp_state == TS_DATA_XFER) { 4389 mutex_exit(&connp->conn_lock); 4390 error = EISCONN; 4391 goto ud_error; 4392 } 4393 4394 /* In case previous destination was multicast or multirt */ 4395 ip_attr_newdst(ixa); 4396 4397 /* 4398 * If laddr is unspecified then we look at sin6_src_id. 4399 * We will give precedence to a source address set with IPV6_PKTINFO 4400 * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't 4401 * want ip_attr_connect to select a source (since it can fail) when 4402 * IPV6_PKTINFO is specified. 4403 * If this doesn't result in a source address then we get a source 4404 * from ip_attr_connect() below. 
4405 */ 4406 v6src = connp->conn_saddr_v6; 4407 if (sin != NULL) { 4408 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); 4409 dstport = sin->sin_port; 4410 flowinfo = 0; 4411 srcid = 0; 4412 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4413 if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { 4414 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4415 connp->conn_netstack); 4416 } 4417 ixa->ixa_flags |= IXAF_IS_IPV4; 4418 } else { 4419 v6dst = sin6->sin6_addr; 4420 dstport = sin6->sin6_port; 4421 flowinfo = sin6->sin6_flowinfo; 4422 srcid = sin6->__sin6_src_id; 4423 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { 4424 ixa->ixa_scopeid = sin6->sin6_scope_id; 4425 ixa->ixa_flags |= IXAF_SCOPEID_SET; 4426 } else { 4427 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 4428 } 4429 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { 4430 ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), 4431 connp->conn_netstack); 4432 } 4433 if (IN6_IS_ADDR_V4MAPPED(&v6dst)) 4434 ixa->ixa_flags |= IXAF_IS_IPV4; 4435 else 4436 ixa->ixa_flags &= ~IXAF_IS_IPV4; 4437 } 4438 /* Handle IPV6_PKTINFO setting source address. */ 4439 if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && 4440 (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) { 4441 ip_pkt_t *ipp = &connp->conn_xmit_ipp; 4442 4443 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4444 if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4445 v6src = ipp->ipp_addr; 4446 } else { 4447 if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) 4448 v6src = ipp->ipp_addr; 4449 } 4450 } 4451 4452 /* Defer IPsec if it might need to look at ICMP type/code */ 4453 switch (ixa->ixa_protocol) { 4454 case IPPROTO_ICMP: 4455 case IPPROTO_ICMPV6: 4456 do_ipsec = B_FALSE; 4457 break; 4458 default: 4459 do_ipsec = B_TRUE; 4460 } 4461 4462 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); 4463 mutex_exit(&connp->conn_lock); 4464 4465 error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, 4466 &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | 4467 (do_ipsec ? 
IPDF_IPSEC : 0)); 4468 switch (error) { 4469 case 0: 4470 break; 4471 case EADDRNOTAVAIL: 4472 /* 4473 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4474 * Don't have the application see that errno 4475 */ 4476 error = ENETUNREACH; 4477 goto failed; 4478 case ENETDOWN: 4479 /* 4480 * Have !ipif_addr_ready address; drop packet silently 4481 * until we can get applications to not send until we 4482 * are ready. 4483 */ 4484 error = 0; 4485 goto failed; 4486 case EHOSTUNREACH: 4487 case ENETUNREACH: 4488 if (ixa->ixa_ire != NULL) { 4489 /* 4490 * Let conn_ip_output/ire_send_noroute return 4491 * the error and send any local ICMP error. 4492 */ 4493 error = 0; 4494 break; 4495 } 4496 /* FALLTHRU */ 4497 default: 4498 failed: 4499 goto ud_error; 4500 } 4501 4502 mutex_enter(&connp->conn_lock); 4503 /* 4504 * While we dropped the lock some other thread might have connected 4505 * this socket. If so we bail out with EISCONN to ensure that the 4506 * connecting thread is the one that updates conn_ixa, conn_ht_* 4507 * and conn_*last*. 4508 */ 4509 if (icmp->icmp_state == TS_DATA_XFER) { 4510 mutex_exit(&connp->conn_lock); 4511 error = EISCONN; 4512 goto ud_error; 4513 } 4514 4515 /* 4516 * We need to rebuild the headers if 4517 * - we are labeling packets (could be different for different 4518 * destinations) 4519 * - we have a source route (or routing header) since we need to 4520 * massage that to get the pseudo-header checksum 4521 * - a socket option with COA_HEADER_CHANGED has been set which 4522 * set conn_v6lastdst to zero. 4523 * 4524 * Otherwise the prepend function will just update the src, dst, 4525 * and flow label. 
4526 */ 4527 if (is_system_labeled()) { 4528 /* TX MLP requires SCM_UCRED and don't have that here */ 4529 if (connp->conn_mlp_type != mlptSingle) { 4530 mutex_exit(&connp->conn_lock); 4531 error = ECONNREFUSED; 4532 goto ud_error; 4533 } 4534 /* 4535 * Check whether Trusted Solaris policy allows communication 4536 * with this host, and pretend that the destination is 4537 * unreachable if not. 4538 * Compute any needed label and place it in ipp_label_v4/v6. 4539 * 4540 * Later conn_build_hdr_template/conn_prepend_hdr takes 4541 * ipp_label_v4/v6 to form the packet. 4542 * 4543 * Tsol note: Since we hold conn_lock we know no other 4544 * thread manipulates conn_xmit_ipp. 4545 */ 4546 error = conn_update_label(connp, ixa, &v6dst, 4547 &connp->conn_xmit_ipp); 4548 if (error != 0) { 4549 mutex_exit(&connp->conn_lock); 4550 goto ud_error; 4551 } 4552 /* Rebuild the header template */ 4553 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4554 flowinfo); 4555 if (error != 0) { 4556 mutex_exit(&connp->conn_lock); 4557 goto ud_error; 4558 } 4559 } else if (connp->conn_xmit_ipp.ipp_fields & 4560 (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || 4561 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { 4562 /* Rebuild the header template */ 4563 error = icmp_build_hdr_template(connp, &v6src, &v6dst, 4564 flowinfo); 4565 if (error != 0) { 4566 mutex_exit(&connp->conn_lock); 4567 goto ud_error; 4568 } 4569 } else { 4570 /* Simply update the destination address if no source route */ 4571 if (ixa->ixa_flags & IXAF_IS_IPV4) { 4572 ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; 4573 4574 IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); 4575 if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { 4576 ipha->ipha_fragment_offset_and_flags |= 4577 IPH_DF_HTONS; 4578 } else { 4579 ipha->ipha_fragment_offset_and_flags &= 4580 ~IPH_DF_HTONS; 4581 } 4582 } else { 4583 ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; 4584 ip6h->ip6_dst = v6dst; 4585 } 4586 } 4587 4588 /* 4589 * Remember the dst etc which corresponds to 
the built header 4590 * template and conn_ixa. 4591 */ 4592 oldixa = conn_replace_ixa(connp, ixa); 4593 connp->conn_v6lastdst = v6dst; 4594 connp->conn_lastflowinfo = flowinfo; 4595 connp->conn_lastscopeid = ixa->ixa_scopeid; 4596 connp->conn_lastsrcid = srcid; 4597 /* Also remember a source to use together with lastdst */ 4598 connp->conn_v6lastsrc = v6src; 4599 4600 data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, 4601 flowinfo, &error); 4602 4603 /* Done with conn_t */ 4604 mutex_exit(&connp->conn_lock); 4605 ixa_refrele(oldixa); 4606 4607 if (data_mp == NULL) { 4608 ASSERT(error != 0); 4609 goto ud_error; 4610 } 4611 4612 if (!do_ipsec) { 4613 /* Policy might differ for different ICMP type/code */ 4614 data_mp = icmp_output_attach_policy(data_mp, connp, ixa); 4615 if (data_mp == NULL) { 4616 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4617 error = EHOSTUNREACH; /* IPsec policy failure */ 4618 goto done; 4619 } 4620 } 4621 4622 /* We're done. Pass the packet to ip. */ 4623 BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); 4624 4625 error = conn_ip_output(data_mp, ixa); 4626 /* No rawipOutErrors if an error since IP increases its error counter */ 4627 switch (error) { 4628 case 0: 4629 break; 4630 case EWOULDBLOCK: 4631 (void) ixa_check_drain_insert(connp, ixa); 4632 error = 0; 4633 break; 4634 case EADDRNOTAVAIL: 4635 /* 4636 * IXAF_VERIFY_SOURCE tells us to pick a better source. 4637 * Don't have the application see that errno 4638 */ 4639 error = ENETUNREACH; 4640 /* FALLTHRU */ 4641 default: 4642 mutex_enter(&connp->conn_lock); 4643 /* 4644 * Clear the source and v6lastdst so we call ip_attr_connect 4645 * for the next packet and try to pick a better source. 
4646 */ 4647 if (connp->conn_mcbc_bind) 4648 connp->conn_saddr_v6 = ipv6_all_zeros; 4649 else 4650 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 4651 connp->conn_v6lastdst = ipv6_all_zeros; 4652 mutex_exit(&connp->conn_lock); 4653 break; 4654 } 4655 done: 4656 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4657 ixa->ixa_cpid = connp->conn_cpid; 4658 ixa_refrele(ixa); 4659 return (error); 4660 4661 ud_error: 4662 ixa->ixa_cred = connp->conn_cred; /* Restore */ 4663 ixa->ixa_cpid = connp->conn_cpid; 4664 ixa_refrele(ixa); 4665 4666 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 4667 freemsg(data_mp); 4668 return (error); 4669 } 4670 4671 /* ARGSUSED */ 4672 static void 4673 icmp_wput_fallback(queue_t *q, mblk_t *mp) 4674 { 4675 #ifdef DEBUG 4676 cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); 4677 #endif 4678 freemsg(mp); 4679 } 4680 4681 static void 4682 icmp_wput_other(queue_t *q, mblk_t *mp) 4683 { 4684 uchar_t *rptr = mp->b_rptr; 4685 struct iocblk *iocp; 4686 conn_t *connp = Q_TO_CONN(q); 4687 icmp_t *icmp = connp->conn_icmp; 4688 icmp_stack_t *is = icmp->icmp_is; 4689 cred_t *cr; 4690 4691 switch (mp->b_datap->db_type) { 4692 case M_PROTO: 4693 case M_PCPROTO: 4694 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { 4695 /* 4696 * If the message does not contain a PRIM_type, 4697 * throw it away. 4698 */ 4699 freemsg(mp); 4700 return; 4701 } 4702 switch (((t_primp_t)rptr)->type) { 4703 case T_ADDR_REQ: 4704 icmp_addr_req(q, mp); 4705 return; 4706 case O_T_BIND_REQ: 4707 case T_BIND_REQ: 4708 icmp_tpi_bind(q, mp); 4709 return; 4710 case T_CONN_REQ: 4711 icmp_tpi_connect(q, mp); 4712 return; 4713 case T_CAPABILITY_REQ: 4714 icmp_capability_req(q, mp); 4715 return; 4716 case T_INFO_REQ: 4717 icmp_info_req(q, mp); 4718 return; 4719 case T_UNITDATA_REQ: 4720 /* 4721 * If a T_UNITDATA_REQ gets here, the address must 4722 * be bad. Valid T_UNITDATA_REQs are handled 4723 * in icmp_wput. 
4724 */ 4725 icmp_ud_err(q, mp, EADDRNOTAVAIL); 4726 return; 4727 case T_UNBIND_REQ: 4728 icmp_tpi_unbind(q, mp); 4729 return; 4730 case T_SVR4_OPTMGMT_REQ: 4731 /* 4732 * All Solaris components should pass a db_credp 4733 * for this TPI message, hence we ASSERT. 4734 * But in case there is some other M_PROTO that looks 4735 * like a TPI message sent by some other kernel 4736 * component, we check and return an error. 4737 */ 4738 cr = msg_getcred(mp, NULL); 4739 ASSERT(cr != NULL); 4740 if (cr == NULL) { 4741 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4742 return; 4743 } 4744 4745 if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, 4746 cr)) { 4747 svr4_optcom_req(q, mp, cr, &icmp_opt_obj); 4748 } 4749 return; 4750 4751 case T_OPTMGMT_REQ: 4752 /* 4753 * All Solaris components should pass a db_credp 4754 * for this TPI message, hence we ASSERT. 4755 * But in case there is some other M_PROTO that looks 4756 * like a TPI message sent by some other kernel 4757 * component, we check and return an error. 4758 */ 4759 cr = msg_getcred(mp, NULL); 4760 ASSERT(cr != NULL); 4761 if (cr == NULL) { 4762 icmp_err_ack(q, mp, TSYSERR, EINVAL); 4763 return; 4764 } 4765 tpi_optcom_req(q, mp, cr, &icmp_opt_obj); 4766 return; 4767 4768 case T_DISCON_REQ: 4769 icmp_tpi_disconnect(q, mp); 4770 return; 4771 4772 /* The following TPI message is not supported by icmp. */ 4773 case O_T_CONN_RES: 4774 case T_CONN_RES: 4775 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4776 return; 4777 4778 /* The following 3 TPI requests are illegal for icmp. 
*/ 4779 case T_DATA_REQ: 4780 case T_EXDATA_REQ: 4781 case T_ORDREL_REQ: 4782 icmp_err_ack(q, mp, TNOTSUPPORT, 0); 4783 return; 4784 default: 4785 break; 4786 } 4787 break; 4788 case M_FLUSH: 4789 if (*rptr & FLUSHW) 4790 flushq(q, FLUSHDATA); 4791 break; 4792 case M_IOCTL: 4793 iocp = (struct iocblk *)mp->b_rptr; 4794 switch (iocp->ioc_cmd) { 4795 case TI_GETPEERNAME: 4796 if (icmp->icmp_state != TS_DATA_XFER) { 4797 /* 4798 * If a default destination address has not 4799 * been associated with the stream, then we 4800 * don't know the peer's name. 4801 */ 4802 iocp->ioc_error = ENOTCONN; 4803 iocp->ioc_count = 0; 4804 mp->b_datap->db_type = M_IOCACK; 4805 qreply(q, mp); 4806 return; 4807 } 4808 /* FALLTHRU */ 4809 case TI_GETMYNAME: 4810 /* 4811 * For TI_GETPEERNAME and TI_GETMYNAME, we first 4812 * need to copyin the user's strbuf structure. 4813 * Processing will continue in the M_IOCDATA case 4814 * below. 4815 */ 4816 mi_copyin(q, mp, NULL, 4817 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 4818 return; 4819 case ND_SET: 4820 /* nd_getset performs the necessary checking */ 4821 case ND_GET: 4822 if (nd_getset(q, is->is_nd, mp)) { 4823 qreply(q, mp); 4824 return; 4825 } 4826 break; 4827 default: 4828 break; 4829 } 4830 break; 4831 case M_IOCDATA: 4832 icmp_wput_iocdata(q, mp); 4833 return; 4834 default: 4835 /* Unrecognized messages are passed through without change. */ 4836 break; 4837 } 4838 ip_wput_nondata(q, mp); 4839 } 4840 4841 /* 4842 * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA 4843 * messages. 4844 */ 4845 static void 4846 icmp_wput_iocdata(queue_t *q, mblk_t *mp) 4847 { 4848 mblk_t *mp1; 4849 STRUCT_HANDLE(strbuf, sb); 4850 uint_t addrlen; 4851 conn_t *connp = Q_TO_CONN(q); 4852 icmp_t *icmp = connp->conn_icmp; 4853 4854 /* Make sure it is one of ours. 
*/ 4855 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4856 case TI_GETMYNAME: 4857 case TI_GETPEERNAME: 4858 break; 4859 default: 4860 ip_wput_nondata(q, mp); 4861 return; 4862 } 4863 4864 switch (mi_copy_state(q, mp, &mp1)) { 4865 case -1: 4866 return; 4867 case MI_COPY_CASE(MI_COPY_IN, 1): 4868 break; 4869 case MI_COPY_CASE(MI_COPY_OUT, 1): 4870 /* 4871 * The address has been copied out, so now 4872 * copyout the strbuf. 4873 */ 4874 mi_copyout(q, mp); 4875 return; 4876 case MI_COPY_CASE(MI_COPY_OUT, 2): 4877 /* 4878 * The address and strbuf have been copied out. 4879 * We're done, so just acknowledge the original 4880 * M_IOCTL. 4881 */ 4882 mi_copy_done(q, mp, 0); 4883 return; 4884 default: 4885 /* 4886 * Something strange has happened, so acknowledge 4887 * the original M_IOCTL with an EPROTO error. 4888 */ 4889 mi_copy_done(q, mp, EPROTO); 4890 return; 4891 } 4892 4893 /* 4894 * Now we have the strbuf structure for TI_GETMYNAME 4895 * and TI_GETPEERNAME. Next we copyout the requested 4896 * address and then we'll copyout the strbuf. 
4897 */ 4898 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 4899 (void *)mp1->b_rptr); 4900 4901 if (connp->conn_family == AF_INET) 4902 addrlen = sizeof (sin_t); 4903 else 4904 addrlen = sizeof (sin6_t); 4905 4906 if (STRUCT_FGET(sb, maxlen) < addrlen) { 4907 mi_copy_done(q, mp, EINVAL); 4908 return; 4909 } 4910 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4911 case TI_GETMYNAME: 4912 break; 4913 case TI_GETPEERNAME: 4914 if (icmp->icmp_state != TS_DATA_XFER) { 4915 mi_copy_done(q, mp, ENOTCONN); 4916 return; 4917 } 4918 break; 4919 default: 4920 mi_copy_done(q, mp, EPROTO); 4921 return; 4922 } 4923 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 4924 if (!mp1) 4925 return; 4926 4927 STRUCT_FSET(sb, len, addrlen); 4928 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 4929 case TI_GETMYNAME: 4930 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 4931 &addrlen); 4932 break; 4933 case TI_GETPEERNAME: 4934 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 4935 &addrlen); 4936 break; 4937 } 4938 mp1->b_wptr += addrlen; 4939 /* Copy out the address */ 4940 mi_copyout(q, mp); 4941 } 4942 4943 void 4944 icmp_ddi_g_init(void) 4945 { 4946 icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, 4947 icmp_opt_obj.odb_opt_arr_cnt); 4948 4949 /* 4950 * We want to be informed each time a stack is created or 4951 * destroyed in the kernel, so we can maintain the 4952 * set of icmp_stack_t's. 4953 */ 4954 netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini); 4955 } 4956 4957 void 4958 icmp_ddi_g_destroy(void) 4959 { 4960 netstack_unregister(NS_ICMP); 4961 } 4962 4963 #define INET_NAME "ip" 4964 4965 /* 4966 * Initialize the ICMP stack instance. 
4967 */ 4968 static void * 4969 rawip_stack_init(netstackid_t stackid, netstack_t *ns) 4970 { 4971 icmp_stack_t *is; 4972 icmpparam_t *pa; 4973 int error = 0; 4974 major_t major; 4975 4976 is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); 4977 is->is_netstack = ns; 4978 4979 pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP); 4980 is->is_param_arr = pa; 4981 bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr)); 4982 4983 (void) icmp_param_register(&is->is_nd, 4984 is->is_param_arr, A_CNT(icmp_param_arr)); 4985 is->is_ksp = rawip_kstat_init(stackid); 4986 4987 major = mod_name_to_major(INET_NAME); 4988 error = ldi_ident_from_major(major, &is->is_ldi_ident); 4989 ASSERT(error == 0); 4990 return (is); 4991 } 4992 4993 /* 4994 * Free the ICMP stack instance. 4995 */ 4996 static void 4997 rawip_stack_fini(netstackid_t stackid, void *arg) 4998 { 4999 icmp_stack_t *is = (icmp_stack_t *)arg; 5000 5001 nd_free(&is->is_nd); 5002 kmem_free(is->is_param_arr, sizeof (icmp_param_arr)); 5003 is->is_param_arr = NULL; 5004 5005 rawip_kstat_fini(stackid, is->is_ksp); 5006 is->is_ksp = NULL; 5007 ldi_ident_release(is->is_ldi_ident); 5008 kmem_free(is, sizeof (*is)); 5009 } 5010 5011 static void * 5012 rawip_kstat_init(netstackid_t stackid) { 5013 kstat_t *ksp; 5014 5015 rawip_named_kstat_t template = { 5016 { "inDatagrams", KSTAT_DATA_UINT32, 0 }, 5017 { "inCksumErrs", KSTAT_DATA_UINT32, 0 }, 5018 { "inErrors", KSTAT_DATA_UINT32, 0 }, 5019 { "outDatagrams", KSTAT_DATA_UINT32, 0 }, 5020 { "outErrors", KSTAT_DATA_UINT32, 0 }, 5021 }; 5022 5023 ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", 5024 KSTAT_TYPE_NAMED, 5025 NUM_OF_FIELDS(rawip_named_kstat_t), 5026 0, stackid); 5027 if (ksp == NULL || ksp->ks_data == NULL) 5028 return (NULL); 5029 5030 bcopy(&template, ksp->ks_data, sizeof (template)); 5031 ksp->ks_update = rawip_kstat_update; 5032 ksp->ks_private = (void *)(uintptr_t)stackid; 5033 5034 kstat_install(ksp); 5035 return (ksp); 5036 } 
5037 5038 static void 5039 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp) 5040 { 5041 if (ksp != NULL) { 5042 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 5043 kstat_delete_netstack(ksp, stackid); 5044 } 5045 } 5046 5047 static int 5048 rawip_kstat_update(kstat_t *ksp, int rw) 5049 { 5050 rawip_named_kstat_t *rawipkp; 5051 netstackid_t stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 5052 netstack_t *ns; 5053 icmp_stack_t *is; 5054 5055 if ((ksp == NULL) || (ksp->ks_data == NULL)) 5056 return (EIO); 5057 5058 if (rw == KSTAT_WRITE) 5059 return (EACCES); 5060 5061 rawipkp = (rawip_named_kstat_t *)ksp->ks_data; 5062 5063 ns = netstack_find_by_stackid(stackid); 5064 if (ns == NULL) 5065 return (-1); 5066 is = ns->netstack_icmp; 5067 if (is == NULL) { 5068 netstack_rele(ns); 5069 return (-1); 5070 } 5071 rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams; 5072 rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs; 5073 rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors; 5074 rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams; 5075 rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors; 5076 netstack_rele(ns); 5077 return (0); 5078 } 5079 5080 /* ARGSUSED */ 5081 int 5082 rawip_accept(sock_lower_handle_t lproto_handle, 5083 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 5084 cred_t *cr) 5085 { 5086 return (EOPNOTSUPP); 5087 } 5088 5089 /* ARGSUSED */ 5090 int 5091 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5092 socklen_t len, cred_t *cr) 5093 { 5094 conn_t *connp = (conn_t *)proto_handle; 5095 int error; 5096 5097 /* All Solaris components should pass a cred for this operation. 
*/ 5098 ASSERT(cr != NULL); 5099 5100 /* Binding to a NULL address really means unbind */ 5101 if (sa == NULL) 5102 error = rawip_do_unbind(connp); 5103 else 5104 error = rawip_do_bind(connp, sa, len); 5105 5106 if (error < 0) { 5107 if (error == -TOUTSTATE) 5108 error = EINVAL; 5109 else 5110 error = proto_tlitosyserr(-error); 5111 } 5112 return (error); 5113 } 5114 5115 static int 5116 rawip_implicit_bind(conn_t *connp) 5117 { 5118 sin6_t sin6addr; 5119 sin_t *sin; 5120 sin6_t *sin6; 5121 socklen_t len; 5122 int error; 5123 5124 if (connp->conn_family == AF_INET) { 5125 len = sizeof (struct sockaddr_in); 5126 sin = (sin_t *)&sin6addr; 5127 *sin = sin_null; 5128 sin->sin_family = AF_INET; 5129 sin->sin_addr.s_addr = INADDR_ANY; 5130 } else { 5131 ASSERT(connp->conn_family == AF_INET6); 5132 len = sizeof (sin6_t); 5133 sin6 = (sin6_t *)&sin6addr; 5134 *sin6 = sin6_null; 5135 sin6->sin6_family = AF_INET6; 5136 V6_SET_ZERO(sin6->sin6_addr); 5137 } 5138 5139 error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); 5140 5141 return ((error < 0) ? proto_tlitosyserr(-error) : error); 5142 } 5143 5144 static int 5145 rawip_unbind(conn_t *connp) 5146 { 5147 int error; 5148 5149 error = rawip_do_unbind(connp); 5150 if (error < 0) { 5151 error = proto_tlitosyserr(-error); 5152 } 5153 return (error); 5154 } 5155 5156 /* ARGSUSED */ 5157 int 5158 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 5159 { 5160 return (EOPNOTSUPP); 5161 } 5162 5163 int 5164 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 5165 socklen_t len, sock_connid_t *id, cred_t *cr) 5166 { 5167 conn_t *connp = (conn_t *)proto_handle; 5168 icmp_t *icmp = connp->conn_icmp; 5169 int error; 5170 boolean_t did_bind = B_FALSE; 5171 pid_t pid = curproc->p_pid; 5172 5173 /* All Solaris components should pass a cred for this operation. 
*/ 5174 ASSERT(cr != NULL); 5175 5176 if (sa == NULL) { 5177 /* 5178 * Disconnect 5179 * Make sure we are connected 5180 */ 5181 if (icmp->icmp_state != TS_DATA_XFER) 5182 return (EINVAL); 5183 5184 error = icmp_disconnect(connp); 5185 return (error); 5186 } 5187 5188 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5189 if (error != 0) 5190 return (error); 5191 5192 /* do an implicit bind if necessary */ 5193 if (icmp->icmp_state == TS_UNBND) { 5194 error = rawip_implicit_bind(connp); 5195 /* 5196 * We could be racing with an actual bind, in which case 5197 * we would see EPROTO. We cross our fingers and try 5198 * to connect. 5199 */ 5200 if (!(error == 0 || error == EPROTO)) 5201 return (error); 5202 did_bind = B_TRUE; 5203 } 5204 5205 /* 5206 * set SO_DGRAM_ERRIND 5207 */ 5208 connp->conn_dgram_errind = B_TRUE; 5209 5210 error = rawip_do_connect(connp, sa, len, cr, pid); 5211 if (error != 0 && did_bind) { 5212 int unbind_err; 5213 5214 unbind_err = rawip_unbind(connp); 5215 ASSERT(unbind_err == 0); 5216 } 5217 5218 if (error == 0) { 5219 *id = 0; 5220 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, 5221 0, NULL, -1); 5222 } else if (error < 0) { 5223 error = proto_tlitosyserr(-error); 5224 } 5225 return (error); 5226 } 5227 5228 /* ARGSUSED2 */ 5229 int 5230 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, 5231 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) 5232 { 5233 conn_t *connp = (conn_t *)proto_handle; 5234 icmp_t *icmp; 5235 struct T_capability_ack tca; 5236 struct sockaddr_in6 laddr, faddr; 5237 socklen_t laddrlen, faddrlen; 5238 short opts; 5239 struct stroptions *stropt; 5240 mblk_t *stropt_mp; 5241 int error; 5242 5243 icmp = connp->conn_icmp; 5244 5245 stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); 5246 5247 /* 5248 * setup the fallback stream that was allocated 5249 */ 5250 connp->conn_dev = (dev_t)RD(q)->q_ptr; 5251 connp->conn_minor_arena = WR(q)->q_ptr; 5252 5253 
RD(q)->q_ptr = WR(q)->q_ptr = connp; 5254 5255 WR(q)->q_qinfo = &icmpwinit; 5256 5257 connp->conn_rq = RD(q); 5258 connp->conn_wq = WR(q); 5259 5260 /* Notify stream head about options before sending up data */ 5261 stropt_mp->b_datap->db_type = M_SETOPTS; 5262 stropt_mp->b_wptr += sizeof (*stropt); 5263 stropt = (struct stroptions *)stropt_mp->b_rptr; 5264 stropt->so_flags = SO_WROFF | SO_HIWAT; 5265 stropt->so_wroff = connp->conn_wroff; 5266 stropt->so_hiwat = connp->conn_rcvbuf; 5267 putnext(RD(q), stropt_mp); 5268 5269 /* 5270 * free helper stream 5271 */ 5272 ip_free_helper_stream(connp); 5273 5274 /* 5275 * Collect the information needed to sync with the sonode 5276 */ 5277 icmp_do_capability_ack(icmp, &tca, TC1_INFO); 5278 5279 laddrlen = faddrlen = sizeof (sin6_t); 5280 (void) rawip_getsockname((sock_lower_handle_t)connp, 5281 (struct sockaddr *)&laddr, &laddrlen, CRED()); 5282 error = rawip_getpeername((sock_lower_handle_t)connp, 5283 (struct sockaddr *)&faddr, &faddrlen, CRED()); 5284 if (error != 0) 5285 faddrlen = 0; 5286 opts = 0; 5287 if (connp->conn_dgram_errind) 5288 opts |= SO_DGRAM_ERRIND; 5289 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 5290 opts |= SO_DONTROUTE; 5291 5292 (*quiesced_cb)(connp->conn_upper_handle, q, &tca, 5293 (struct sockaddr *)&laddr, laddrlen, 5294 (struct sockaddr *)&faddr, faddrlen, opts); 5295 5296 /* 5297 * Attempts to send data up during fallback will result in it being 5298 * queued in icmp_t. Now we push up any queued packets. 
5299 */ 5300 mutex_enter(&icmp->icmp_recv_lock); 5301 while (icmp->icmp_fallback_queue_head != NULL) { 5302 mblk_t *mp; 5303 5304 mp = icmp->icmp_fallback_queue_head; 5305 icmp->icmp_fallback_queue_head = mp->b_next; 5306 mp->b_next = NULL; 5307 mutex_exit(&icmp->icmp_recv_lock); 5308 putnext(RD(q), mp); 5309 mutex_enter(&icmp->icmp_recv_lock); 5310 } 5311 icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; 5312 5313 /* 5314 * No longer a streams less socket 5315 */ 5316 mutex_enter(&connp->conn_lock); 5317 connp->conn_flags &= ~IPCL_NONSTR; 5318 mutex_exit(&connp->conn_lock); 5319 5320 mutex_exit(&icmp->icmp_recv_lock); 5321 5322 ASSERT(icmp->icmp_fallback_queue_head == NULL && 5323 icmp->icmp_fallback_queue_tail == NULL); 5324 5325 ASSERT(connp->conn_ref >= 1); 5326 5327 return (0); 5328 } 5329 5330 /* ARGSUSED2 */ 5331 sock_lower_handle_t 5332 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 5333 uint_t *smodep, int *errorp, int flags, cred_t *credp) 5334 { 5335 conn_t *connp; 5336 5337 if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { 5338 *errorp = EPROTONOSUPPORT; 5339 return (NULL); 5340 } 5341 5342 connp = rawip_do_open(family, credp, errorp, flags); 5343 if (connp != NULL) { 5344 connp->conn_flags |= IPCL_NONSTR; 5345 5346 mutex_enter(&connp->conn_lock); 5347 connp->conn_state_flags &= ~CONN_INCIPIENT; 5348 mutex_exit(&connp->conn_lock); 5349 *sock_downcalls = &sock_rawip_downcalls; 5350 *smodep = SM_ATOMIC; 5351 } else { 5352 ASSERT(*errorp != 0); 5353 } 5354 5355 return ((sock_lower_handle_t)connp); 5356 } 5357 5358 /* ARGSUSED3 */ 5359 void 5360 rawip_activate(sock_lower_handle_t proto_handle, 5361 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, 5362 cred_t *cr) 5363 { 5364 conn_t *connp = (conn_t *)proto_handle; 5365 struct sock_proto_props sopp; 5366 5367 /* All Solaris components should pass a cred for this operation. 
*/ 5368 ASSERT(cr != NULL); 5369 5370 connp->conn_upcalls = sock_upcalls; 5371 connp->conn_upper_handle = sock_handle; 5372 5373 sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 5374 SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; 5375 sopp.sopp_wroff = connp->conn_wroff; 5376 sopp.sopp_rxhiwat = connp->conn_rcvbuf; 5377 sopp.sopp_rxlowat = connp->conn_rcvlowat; 5378 sopp.sopp_maxblk = INFPSZ; 5379 sopp.sopp_maxpsz = IP_MAXPACKET; 5380 sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 : 5381 icmp_mod_info.mi_minpsz; 5382 5383 (*connp->conn_upcalls->su_set_proto_props) 5384 (connp->conn_upper_handle, &sopp); 5385 5386 icmp_bind_proto(connp->conn_icmp); 5387 } 5388 5389 /* ARGSUSED3 */ 5390 int 5391 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5392 socklen_t *salenp, cred_t *cr) 5393 { 5394 conn_t *connp = (conn_t *)proto_handle; 5395 icmp_t *icmp = connp->conn_icmp; 5396 int error; 5397 5398 /* All Solaris components should pass a cred for this operation. */ 5399 ASSERT(cr != NULL); 5400 5401 mutex_enter(&connp->conn_lock); 5402 if (icmp->icmp_state != TS_DATA_XFER) 5403 error = ENOTCONN; 5404 else 5405 error = conn_getpeername(connp, sa, salenp); 5406 mutex_exit(&connp->conn_lock); 5407 return (error); 5408 } 5409 5410 /* ARGSUSED3 */ 5411 int 5412 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, 5413 socklen_t *salenp, cred_t *cr) 5414 { 5415 conn_t *connp = (conn_t *)proto_handle; 5416 int error; 5417 5418 /* All Solaris components should pass a cred for this operation. 
*/ 5419 ASSERT(cr != NULL); 5420 5421 mutex_enter(&connp->conn_lock); 5422 error = conn_getsockname(connp, sa, salenp); 5423 mutex_exit(&connp->conn_lock); 5424 return (error); 5425 } 5426 5427 int 5428 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5429 const void *optvalp, socklen_t optlen, cred_t *cr) 5430 { 5431 conn_t *connp = (conn_t *)proto_handle; 5432 int error; 5433 5434 /* All Solaris components should pass a cred for this operation. */ 5435 ASSERT(cr != NULL); 5436 5437 error = proto_opt_check(level, option_name, optlen, NULL, 5438 icmp_opt_obj.odb_opt_des_arr, 5439 icmp_opt_obj.odb_opt_arr_cnt, 5440 B_TRUE, B_FALSE, cr); 5441 5442 if (error != 0) { 5443 /* 5444 * option not recognized 5445 */ 5446 if (error < 0) { 5447 error = proto_tlitosyserr(-error); 5448 } 5449 return (error); 5450 } 5451 5452 error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, 5453 option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, 5454 (uchar_t *)optvalp, NULL, cr); 5455 5456 ASSERT(error >= 0); 5457 5458 return (error); 5459 } 5460 5461 int 5462 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 5463 void *optvalp, socklen_t *optlen, cred_t *cr) 5464 { 5465 int error; 5466 conn_t *connp = (conn_t *)proto_handle; 5467 t_uscalar_t max_optbuf_len; 5468 void *optvalp_buf; 5469 int len; 5470 5471 /* All Solaris components should pass a cred for this operation. 
*/ 5472 ASSERT(cr != NULL); 5473 5474 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 5475 icmp_opt_obj.odb_opt_des_arr, 5476 icmp_opt_obj.odb_opt_arr_cnt, 5477 B_FALSE, B_TRUE, cr); 5478 5479 if (error != 0) { 5480 if (error < 0) { 5481 error = proto_tlitosyserr(-error); 5482 } 5483 return (error); 5484 } 5485 5486 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 5487 len = icmp_opt_get(connp, level, option_name, optvalp_buf); 5488 if (len == -1) { 5489 kmem_free(optvalp_buf, max_optbuf_len); 5490 return (EINVAL); 5491 } 5492 5493 /* 5494 * update optlen and copy option value 5495 */ 5496 t_uscalar_t size = MIN(len, *optlen); 5497 5498 bcopy(optvalp_buf, optvalp, size); 5499 bcopy(&size, optlen, sizeof (size)); 5500 5501 kmem_free(optvalp_buf, max_optbuf_len); 5502 return (0); 5503 } 5504 5505 /* ARGSUSED1 */ 5506 int 5507 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 5508 { 5509 conn_t *connp = (conn_t *)proto_handle; 5510 5511 /* All Solaris components should pass a cred for this operation. */ 5512 ASSERT(cr != NULL); 5513 5514 (void) rawip_do_close(connp); 5515 return (0); 5516 } 5517 5518 /* ARGSUSED2 */ 5519 int 5520 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 5521 { 5522 conn_t *connp = (conn_t *)proto_handle; 5523 5524 /* All Solaris components should pass a cred for this operation. 
*/ 5525 ASSERT(cr != NULL); 5526 5527 /* shut down the send side */ 5528 if (how != SHUT_RD) 5529 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5530 SOCK_OPCTL_SHUT_SEND, 0); 5531 /* shut down the recv side */ 5532 if (how != SHUT_WR) 5533 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 5534 SOCK_OPCTL_SHUT_RECV, 0); 5535 return (0); 5536 } 5537 5538 void 5539 rawip_clr_flowctrl(sock_lower_handle_t proto_handle) 5540 { 5541 conn_t *connp = (conn_t *)proto_handle; 5542 icmp_t *icmp = connp->conn_icmp; 5543 5544 mutex_enter(&icmp->icmp_recv_lock); 5545 connp->conn_flow_cntrld = B_FALSE; 5546 mutex_exit(&icmp->icmp_recv_lock); 5547 } 5548 5549 int 5550 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 5551 int mode, int32_t *rvalp, cred_t *cr) 5552 { 5553 conn_t *connp = (conn_t *)proto_handle; 5554 int error; 5555 5556 /* All Solaris components should pass a cred for this operation. */ 5557 ASSERT(cr != NULL); 5558 5559 /* 5560 * If we don't have a helper stream then create one. 5561 * ip_create_helper_stream takes care of locking the conn_t, 5562 * so this check for NULL is just a performance optimization. 5563 */ 5564 if (connp->conn_helper_info == NULL) { 5565 icmp_stack_t *is = connp->conn_icmp->icmp_is; 5566 5567 ASSERT(is->is_ldi_ident != NULL); 5568 5569 /* 5570 * Create a helper stream for non-STREAMS socket. 
5571 */ 5572 error = ip_create_helper_stream(connp, is->is_ldi_ident); 5573 if (error != 0) { 5574 ip0dbg(("rawip_ioctl: create of IP helper stream " 5575 "failed %d\n", error)); 5576 return (error); 5577 } 5578 } 5579 5580 switch (cmd) { 5581 case ND_SET: 5582 case ND_GET: 5583 case _SIOCSOCKFALLBACK: 5584 case TI_GETPEERNAME: 5585 case TI_GETMYNAME: 5586 #ifdef DEBUG 5587 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" 5588 " socket", cmd); 5589 #endif 5590 error = EINVAL; 5591 break; 5592 default: 5593 /* 5594 * Pass on to IP using helper stream 5595 */ 5596 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 5597 cmd, arg, mode, cr, rvalp); 5598 break; 5599 } 5600 return (error); 5601 } 5602 5603 int 5604 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 5605 cred_t *cr) 5606 { 5607 sin6_t *sin6; 5608 sin_t *sin = NULL; 5609 uint_t srcid; 5610 conn_t *connp = (conn_t *)proto_handle; 5611 icmp_t *icmp = connp->conn_icmp; 5612 int error = 0; 5613 icmp_stack_t *is = icmp->icmp_is; 5614 pid_t pid = curproc->p_pid; 5615 ip_xmit_attr_t *ixa; 5616 5617 ASSERT(DB_TYPE(mp) == M_DATA); 5618 5619 /* All Solaris components should pass a cred for this operation. */ 5620 ASSERT(cr != NULL); 5621 5622 /* do an implicit bind if necessary */ 5623 if (icmp->icmp_state == TS_UNBND) { 5624 error = rawip_implicit_bind(connp); 5625 /* 5626 * We could be racing with an actual bind, in which case 5627 * we would see EPROTO. We cross our fingers and try 5628 * to connect. 
5629 */ 5630 if (!(error == 0 || error == EPROTO)) { 5631 freemsg(mp); 5632 return (error); 5633 } 5634 } 5635 5636 /* Protocol 255 contains full IP headers */ 5637 /* Read without holding lock */ 5638 if (icmp->icmp_hdrincl) { 5639 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5640 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { 5641 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { 5642 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5643 freemsg(mp); 5644 return (EINVAL); 5645 } 5646 } 5647 error = icmp_output_hdrincl(connp, mp, cr, pid); 5648 if (is->is_sendto_ignerr) 5649 return (0); 5650 else 5651 return (error); 5652 } 5653 5654 /* Connected? */ 5655 if (msg->msg_name == NULL) { 5656 if (icmp->icmp_state != TS_DATA_XFER) { 5657 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5658 return (EDESTADDRREQ); 5659 } 5660 if (msg->msg_controllen != 0) { 5661 error = icmp_output_ancillary(connp, NULL, NULL, mp, 5662 NULL, msg, cr, pid); 5663 } else { 5664 error = icmp_output_connected(connp, mp, cr, pid); 5665 } 5666 if (is->is_sendto_ignerr) 5667 return (0); 5668 else 5669 return (error); 5670 } 5671 if (icmp->icmp_state == TS_DATA_XFER) { 5672 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5673 return (EISCONN); 5674 } 5675 error = proto_verify_ip_addr(connp->conn_family, 5676 (struct sockaddr *)msg->msg_name, msg->msg_namelen); 5677 if (error != 0) { 5678 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5679 return (error); 5680 } 5681 switch (connp->conn_family) { 5682 case AF_INET6: 5683 sin6 = (sin6_t *)msg->msg_name; 5684 5685 /* No support for mapped addresses on raw sockets */ 5686 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 5687 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5688 return (EADDRNOTAVAIL); 5689 } 5690 srcid = sin6->__sin6_src_id; 5691 5692 /* 5693 * If the local address is a mapped address return 5694 * an error. 
5695 * It would be possible to send an IPv6 packet but the 5696 * response would never make it back to the application 5697 * since it is bound to a mapped address. 5698 */ 5699 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { 5700 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5701 return (EADDRNOTAVAIL); 5702 } 5703 5704 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 5705 sin6->sin6_addr = ipv6_loopback; 5706 5707 /* 5708 * We have to allocate an ip_xmit_attr_t before we grab 5709 * conn_lock and we need to hold conn_lock once we've check 5710 * conn_same_as_last_v6 to handle concurrent send* calls on a 5711 * socket. 5712 */ 5713 if (msg->msg_controllen == 0) { 5714 ixa = conn_get_ixa(connp, B_FALSE); 5715 if (ixa == NULL) { 5716 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5717 return (ENOMEM); 5718 } 5719 } else { 5720 ixa = NULL; 5721 } 5722 mutex_enter(&connp->conn_lock); 5723 if (icmp->icmp_delayed_error != 0) { 5724 sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; 5725 5726 error = icmp->icmp_delayed_error; 5727 icmp->icmp_delayed_error = 0; 5728 5729 /* Compare IP address and family */ 5730 5731 if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, 5732 &sin2->sin6_addr) && 5733 sin6->sin6_family == sin2->sin6_family) { 5734 mutex_exit(&connp->conn_lock); 5735 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5736 if (ixa != NULL) 5737 ixa_refrele(ixa); 5738 return (error); 5739 } 5740 } 5741 if (msg->msg_controllen != 0) { 5742 mutex_exit(&connp->conn_lock); 5743 ASSERT(ixa == NULL); 5744 error = icmp_output_ancillary(connp, NULL, sin6, mp, 5745 NULL, msg, cr, pid); 5746 } else if (conn_same_as_last_v6(connp, sin6) && 5747 connp->conn_lastsrcid == srcid && 5748 ipsec_outbound_policy_current(ixa)) { 5749 /* icmp_output_lastdst drops conn_lock */ 5750 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5751 } else { 5752 /* icmp_output_newdst drops conn_lock */ 5753 error = icmp_output_newdst(connp, mp, NULL, sin6, cr, 5754 pid, ixa); 5755 } 5756 
ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5757 if (is->is_sendto_ignerr) 5758 return (0); 5759 else 5760 return (error); 5761 case AF_INET: 5762 sin = (sin_t *)msg->msg_name; 5763 5764 if (sin->sin_addr.s_addr == INADDR_ANY) 5765 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 5766 5767 /* 5768 * We have to allocate an ip_xmit_attr_t before we grab 5769 * conn_lock and we need to hold conn_lock once we've check 5770 * conn_same_as_last_v6 to handle concurrent send* on a socket. 5771 */ 5772 if (msg->msg_controllen == 0) { 5773 ixa = conn_get_ixa(connp, B_FALSE); 5774 if (ixa == NULL) { 5775 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5776 return (ENOMEM); 5777 } 5778 } else { 5779 ixa = NULL; 5780 } 5781 mutex_enter(&connp->conn_lock); 5782 if (icmp->icmp_delayed_error != 0) { 5783 sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; 5784 5785 error = icmp->icmp_delayed_error; 5786 icmp->icmp_delayed_error = 0; 5787 5788 /* Compare IP address */ 5789 5790 if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { 5791 mutex_exit(&connp->conn_lock); 5792 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); 5793 if (ixa != NULL) 5794 ixa_refrele(ixa); 5795 return (error); 5796 } 5797 } 5798 5799 if (msg->msg_controllen != 0) { 5800 mutex_exit(&connp->conn_lock); 5801 ASSERT(ixa == NULL); 5802 error = icmp_output_ancillary(connp, sin, NULL, mp, 5803 NULL, msg, cr, pid); 5804 } else if (conn_same_as_last_v4(connp, sin) && 5805 ipsec_outbound_policy_current(ixa)) { 5806 /* icmp_output_lastdst drops conn_lock */ 5807 error = icmp_output_lastdst(connp, mp, cr, pid, ixa); 5808 } else { 5809 /* icmp_output_newdst drops conn_lock */ 5810 error = icmp_output_newdst(connp, mp, sin, NULL, cr, 5811 pid, ixa); 5812 } 5813 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); 5814 if (is->is_sendto_ignerr) 5815 return (0); 5816 else 5817 return (error); 5818 default: 5819 return (EINVAL); 5820 } 5821 } 5822 5823 sock_downcalls_t sock_rawip_downcalls = { 5824 rawip_activate, 5825 rawip_accept, 5826 
rawip_bind, 5827 rawip_listen, 5828 rawip_connect, 5829 rawip_getpeername, 5830 rawip_getsockname, 5831 rawip_getsockopt, 5832 rawip_setsockopt, 5833 rawip_send, 5834 NULL, 5835 NULL, 5836 NULL, 5837 rawip_shutdown, 5838 rawip_clr_flowctrl, 5839 rawip_ioctl, 5840 rawip_close 5841 }; 5842