1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/types.h> 26 #include <sys/stream.h> 27 #define _SUN_TPI_VERSION 2 28 #include <sys/tihdr.h> 29 #include <sys/socket.h> 30 #include <sys/xti_xtiopt.h> 31 #include <sys/xti_inet.h> 32 #include <sys/policy.h> 33 34 #include <inet/common.h> 35 #include <netinet/ip6.h> 36 #include <inet/ip.h> 37 38 #include <netinet/in.h> 39 #include <netinet/tcp.h> 40 #include <inet/optcom.h> 41 #include <inet/proto_set.h> 42 #include <inet/tcp_impl.h> 43 44 /* 45 * Table of all known options handled on a TCP protocol stack. 46 * 47 * Note: This table contains options processed by both TCP and IP levels 48 * and is the superset of options that can be performed on a TCP over IP 49 * stack. 50 */ 51 opdes_t tcp_opt_arr[] = { 52 53 { SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 54 sizeof (struct linger), 0 }, 55 56 { SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 57 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 58 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 59 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 60 }, 61 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 62 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 63 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 64 { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 65 { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 66 { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 67 { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 68 sizeof (struct timeval), 0 }, 69 { SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 70 sizeof (struct timeval), 0 }, 71 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 72 }, 73 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 74 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 75 0 }, 76 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 77 0 }, 78 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 79 0 }, 80 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 81 0 }, 82 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 83 84 { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 85 86 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 87 88 { TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 89 }, 90 { TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t), 91 536 }, 92 93 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 94 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 95 96 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 97 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 98 99 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 100 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 101 102 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 103 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 104 105 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 106 0 }, 107 108 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0, 109 sizeof (int), 0 }, 110 111 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 112 }, 113 114 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0, 115 sizeof (int), 0 }, 116 117 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 118 sizeof (int), 0 }, 119 120 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 121 sizeof (int), 0 }, 122 123 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 124 125 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 126 127 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 128 129 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 130 131 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 132 133 { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 134 (OP_VARLEN|OP_NODEFAULT), 135 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 136 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 137 (OP_VARLEN|OP_NODEFAULT), 138 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 139 140 { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 141 { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 142 { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 143 sizeof (int), -1 /* not initialized */ }, 144 145 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 146 sizeof (ipsec_req_t), -1 /* not initialized */ }, 147 148 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, 149 sizeof (int), 0 /* no ifindex */ }, 150 151 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, 152 sizeof (int), 0 }, 153 154 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 155 sizeof (int), -1 /* not initialized */ }, 156 157 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 158 sizeof (int), 0 /* no ifindex */ }, 159 160 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 161 162 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, 163 sizeof (in_addr_t), -1 /* not initialized */ }, 164 165 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, 166 sizeof (int), 0 }, 167 168 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 169 (OP_NODEFAULT|OP_VARLEN), 170 sizeof (struct in6_pktinfo), -1 /* not initialized */ }, 171 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 172 OP_NODEFAULT, 173 sizeof (sin6_t), -1 /* not initialized */ }, 174 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 175 (OP_VARLEN|OP_NODEFAULT), 255*8, 176 -1 /* not initialized */ }, 177 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 178 (OP_VARLEN|OP_NODEFAULT), 255*8, 179 -1 /* not initialized */ }, 180 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 181 (OP_VARLEN|OP_NODEFAULT), 255*8, 182 -1 /* not initialized */ }, 183 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 184 (OP_VARLEN|OP_NODEFAULT), 255*8, 185 -1 /* not initialized */ }, 186 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 187 OP_NODEFAULT, 188 sizeof (int), -1 /* not initialized */ }, 189 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 190 OP_NODEFAULT, 191 sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, 192 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 193 sizeof (int), 0 }, 194 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 195 sizeof (int), 0 }, 196 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 197 sizeof (int), 0 }, 198 199 /* Enable receipt of ancillary data */ 200 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 201 sizeof (int), 0 }, 202 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 203 sizeof (int), 0 }, 204 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 205 sizeof (int), 0 }, 206 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 207 sizeof (int), 0 }, 208 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 209 sizeof (int), 0 }, 210 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 211 sizeof (int), 0 }, 212 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 213 sizeof (int), 0 }, 214 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 215 sizeof (int), 0 }, 216 217 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 218 sizeof (ipsec_req_t), -1 /* not initialized */ }, 219 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 220 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, 221 }; 222 223 /* 224 * Table of all supported levels 225 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have 226 * any supported options so we need this info separately. 227 * 228 * This is needed only for topmost tpi providers and is used only by 229 * XTI interfaces. 230 */ 231 optlevel_t tcp_valid_levels_arr[] = { 232 XTI_GENERIC, 233 SOL_SOCKET, 234 IPPROTO_TCP, 235 IPPROTO_IP, 236 IPPROTO_IPV6 237 }; 238 239 240 #define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr) 241 #define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr) 242 243 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */ 244 245 /* 246 * Initialize option database object for TCP 247 * 248 * This object represents database of options to search passed to 249 * {sock,tpi}optcom_req() interface routine to take care of option 250 * management and associated methods. 251 */ 252 253 optdb_obj_t tcp_opt_obj = { 254 tcp_opt_default, /* TCP default value function pointer */ 255 tcp_tpi_opt_get, /* TCP get function pointer */ 256 tcp_tpi_opt_set, /* TCP set function pointer */ 257 TCP_OPT_ARR_CNT, /* TCP option database count of entries */ 258 tcp_opt_arr, /* TCP option database */ 259 TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */ 260 tcp_valid_levels_arr /* TCP valid level array */ 261 }; 262 263 /* Maximum TCP initial cwin (start/restart). */ 264 #define TCP_MAX_INIT_CWND 16 265 266 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND; 267 268 /* 269 * Some TCP options can be "set" by requesting them in the option 270 * buffer. This is needed for XTI feature test though we do not 271 * allow it in general. We interpret that this mechanism is more 272 * applicable to OSI protocols and need not be allowed in general. 273 * This routine filters out options for which it is not allowed (most) 274 * and lets through those (few) for which it is. [ The XTI interface 275 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 276 * ever implemented will have to be allowed here ]. 277 */ 278 static boolean_t 279 tcp_allow_connopt_set(int level, int name) 280 { 281 282 switch (level) { 283 case IPPROTO_TCP: 284 switch (name) { 285 case TCP_NODELAY: 286 return (B_TRUE); 287 default: 288 return (B_FALSE); 289 } 290 /*NOTREACHED*/ 291 default: 292 return (B_FALSE); 293 } 294 /*NOTREACHED*/ 295 } 296 297 /* 298 * This routine gets default values of certain options whose default 299 * values are maintained by protocol specific code 300 */ 301 /* ARGSUSED */ 302 int 303 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 304 { 305 int32_t *i1 = (int32_t *)ptr; 306 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 307 308 switch (level) { 309 case IPPROTO_TCP: 310 switch (name) { 311 case TCP_NOTIFY_THRESHOLD: 312 *i1 = tcps->tcps_ip_notify_interval; 313 break; 314 case TCP_ABORT_THRESHOLD: 315 *i1 = tcps->tcps_ip_abort_interval; 316 break; 317 case TCP_CONN_NOTIFY_THRESHOLD: 318 *i1 = tcps->tcps_ip_notify_cinterval; 319 break; 320 case TCP_CONN_ABORT_THRESHOLD: 321 *i1 = tcps->tcps_ip_abort_cinterval; 322 break; 323 default: 324 return (-1); 325 } 326 break; 327 case IPPROTO_IP: 328 switch (name) { 329 case IP_TTL: 330 *i1 = tcps->tcps_ipv4_ttl; 331 break; 332 default: 333 return (-1); 334 } 335 break; 336 case IPPROTO_IPV6: 337 switch (name) { 338 case IPV6_UNICAST_HOPS: 339 *i1 = tcps->tcps_ipv6_hoplimit; 340 break; 341 default: 342 return (-1); 343 } 344 break; 345 default: 346 return (-1); 347 } 348 return (sizeof (int)); 349 } 350 351 /* 352 * TCP routine to get the values of options. 353 */ 354 int 355 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 356 { 357 int *i1 = (int *)ptr; 358 tcp_t *tcp = connp->conn_tcp; 359 conn_opt_arg_t coas; 360 int retval; 361 362 coas.coa_connp = connp; 363 coas.coa_ixa = connp->conn_ixa; 364 coas.coa_ipp = &connp->conn_xmit_ipp; 365 coas.coa_ancillary = B_FALSE; 366 coas.coa_changed = 0; 367 368 switch (level) { 369 case SOL_SOCKET: 370 switch (name) { 371 case SO_SND_COPYAVOID: 372 *i1 = tcp->tcp_snd_zcopy_on ? 373 SO_SND_COPYAVOID : 0; 374 return (sizeof (int)); 375 case SO_ACCEPTCONN: 376 *i1 = (tcp->tcp_state == TCPS_LISTEN); 377 return (sizeof (int)); 378 } 379 break; 380 case IPPROTO_TCP: 381 switch (name) { 382 case TCP_NODELAY: 383 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 384 return (sizeof (int)); 385 case TCP_MAXSEG: 386 *i1 = tcp->tcp_mss; 387 return (sizeof (int)); 388 case TCP_NOTIFY_THRESHOLD: 389 *i1 = (int)tcp->tcp_first_timer_threshold; 390 return (sizeof (int)); 391 case TCP_ABORT_THRESHOLD: 392 *i1 = tcp->tcp_second_timer_threshold; 393 return (sizeof (int)); 394 case TCP_CONN_NOTIFY_THRESHOLD: 395 *i1 = tcp->tcp_first_ctimer_threshold; 396 return (sizeof (int)); 397 case TCP_CONN_ABORT_THRESHOLD: 398 *i1 = tcp->tcp_second_ctimer_threshold; 399 return (sizeof (int)); 400 case TCP_INIT_CWND: 401 *i1 = tcp->tcp_init_cwnd; 402 return (sizeof (int)); 403 case TCP_KEEPALIVE_THRESHOLD: 404 *i1 = tcp->tcp_ka_interval; 405 return (sizeof (int)); 406 case TCP_KEEPALIVE_ABORT_THRESHOLD: 407 *i1 = tcp->tcp_ka_abort_thres; 408 return (sizeof (int)); 409 case TCP_CORK: 410 *i1 = tcp->tcp_cork; 411 return (sizeof (int)); 412 case TCP_RTO_INITIAL: 413 *i1 = tcp->tcp_rto_initial; 414 return (sizeof (uint32_t)); 415 case TCP_RTO_MIN: 416 *i1 = tcp->tcp_rto_min; 417 return (sizeof (uint32_t)); 418 case TCP_RTO_MAX: 419 *i1 = tcp->tcp_rto_max; 420 return (sizeof (uint32_t)); 421 case TCP_LINGER2: 422 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS; 423 return (sizeof (int)); 424 } 425 break; 426 case IPPROTO_IP: 427 if (connp->conn_family != AF_INET) 428 return (-1); 429 switch (name) { 430 case IP_OPTIONS: 431 case T_IP_OPTIONS: 432 /* Caller ensures enough space */ 433 return (ip_opt_get_user(connp, ptr)); 434 default: 435 break; 436 } 437 break; 438 439 case IPPROTO_IPV6: 440 /* 441 * IPPROTO_IPV6 options are only supported for sockets 442 * that are using IPv6 on the wire. 443 */ 444 if (connp->conn_ipversion != IPV6_VERSION) { 445 return (-1); 446 } 447 switch (name) { 448 case IPV6_PATHMTU: 449 if (tcp->tcp_state < TCPS_ESTABLISHED) 450 return (-1); 451 break; 452 } 453 break; 454 } 455 mutex_enter(&connp->conn_lock); 456 retval = conn_opt_get(&coas, level, name, ptr); 457 mutex_exit(&connp->conn_lock); 458 return (retval); 459 } 460 461 /* 462 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 463 * Parameters are assumed to be verified by the caller. 464 */ 465 /* ARGSUSED */ 466 int 467 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 468 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 469 void *thisdg_attrs, cred_t *cr) 470 { 471 tcp_t *tcp = connp->conn_tcp; 472 int *i1 = (int *)invalp; 473 boolean_t onoff = (*i1 == 0) ? 0 : 1; 474 boolean_t checkonly; 475 int reterr; 476 tcp_stack_t *tcps = tcp->tcp_tcps; 477 conn_opt_arg_t coas; 478 uint32_t val = *((uint32_t *)invalp); 479 480 coas.coa_connp = connp; 481 coas.coa_ixa = connp->conn_ixa; 482 coas.coa_ipp = &connp->conn_xmit_ipp; 483 coas.coa_ancillary = B_FALSE; 484 coas.coa_changed = 0; 485 486 switch (optset_context) { 487 case SETFN_OPTCOM_CHECKONLY: 488 checkonly = B_TRUE; 489 /* 490 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 491 * inlen != 0 implies value supplied and 492 * we have to "pretend" to set it. 493 * inlen == 0 implies that there is no 494 * value part in T_CHECK request and just validation 495 * done elsewhere should be enough, we just return here. 496 */ 497 if (inlen == 0) { 498 *outlenp = 0; 499 return (0); 500 } 501 break; 502 case SETFN_OPTCOM_NEGOTIATE: 503 checkonly = B_FALSE; 504 break; 505 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 506 case SETFN_CONN_NEGOTIATE: 507 checkonly = B_FALSE; 508 /* 509 * Negotiating local and "association-related" options 510 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 511 * primitives is allowed by XTI, but we choose 512 * to not implement this style negotiation for Internet 513 * protocols (We interpret it is a must for OSI world but 514 * optional for Internet protocols) for all options. 515 * [ Will do only for the few options that enable test 516 * suites that our XTI implementation of this feature 517 * works for transports that do allow it ] 518 */ 519 if (!tcp_allow_connopt_set(level, name)) { 520 *outlenp = 0; 521 return (EINVAL); 522 } 523 break; 524 default: 525 /* 526 * We should never get here 527 */ 528 *outlenp = 0; 529 return (EINVAL); 530 } 531 532 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 533 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 534 535 /* 536 * For TCP, we should have no ancillary data sent down 537 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 538 * has to be zero. 539 */ 540 ASSERT(thisdg_attrs == NULL); 541 542 /* 543 * For fixed length options, no sanity check 544 * of passed in length is done. It is assumed *_optcom_req() 545 * routines do the right thing. 546 */ 547 switch (level) { 548 case SOL_SOCKET: 549 switch (name) { 550 case SO_KEEPALIVE: 551 if (checkonly) { 552 /* check only case */ 553 break; 554 } 555 556 if (!onoff) { 557 if (connp->conn_keepalive) { 558 if (tcp->tcp_ka_tid != 0) { 559 (void) TCP_TIMER_CANCEL(tcp, 560 tcp->tcp_ka_tid); 561 tcp->tcp_ka_tid = 0; 562 } 563 connp->conn_keepalive = 0; 564 } 565 break; 566 } 567 if (!connp->conn_keepalive) { 568 /* Crank up the keepalive timer */ 569 tcp->tcp_ka_last_intrvl = 0; 570 tcp->tcp_ka_tid = TCP_TIMER(tcp, 571 tcp_keepalive_timer, tcp->tcp_ka_interval); 572 connp->conn_keepalive = 1; 573 } 574 break; 575 case SO_SNDBUF: { 576 if (*i1 > tcps->tcps_max_buf) { 577 *outlenp = 0; 578 return (ENOBUFS); 579 } 580 if (checkonly) 581 break; 582 583 connp->conn_sndbuf = *i1; 584 if (tcps->tcps_snd_lowat_fraction != 0) { 585 connp->conn_sndlowat = connp->conn_sndbuf / 586 tcps->tcps_snd_lowat_fraction; 587 } 588 (void) tcp_maxpsz_set(tcp, B_TRUE); 589 /* 590 * If we are flow-controlled, recheck the condition. 591 * There are apps that increase SO_SNDBUF size when 592 * flow-controlled (EWOULDBLOCK), and expect the flow 593 * control condition to be lifted right away. 594 */ 595 mutex_enter(&tcp->tcp_non_sq_lock); 596 if (tcp->tcp_flow_stopped && 597 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { 598 tcp_clrqfull(tcp); 599 } 600 mutex_exit(&tcp->tcp_non_sq_lock); 601 *outlenp = inlen; 602 return (0); 603 } 604 case SO_RCVBUF: 605 if (*i1 > tcps->tcps_max_buf) { 606 *outlenp = 0; 607 return (ENOBUFS); 608 } 609 /* Silently ignore zero */ 610 if (!checkonly && *i1 != 0) { 611 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 612 (void) tcp_rwnd_set(tcp, *i1); 613 } 614 /* 615 * XXX should we return the rwnd here 616 * and tcp_opt_get ? 617 */ 618 *outlenp = inlen; 619 return (0); 620 case SO_SND_COPYAVOID: 621 if (!checkonly) { 622 if (tcp->tcp_loopback || 623 (tcp->tcp_kssl_ctx != NULL) || 624 (onoff != 1) || !tcp_zcopy_check(tcp)) { 625 *outlenp = 0; 626 return (EOPNOTSUPP); 627 } 628 tcp->tcp_snd_zcopy_aware = 1; 629 } 630 *outlenp = inlen; 631 return (0); 632 } 633 break; 634 case IPPROTO_TCP: 635 switch (name) { 636 case TCP_NODELAY: 637 if (!checkonly) 638 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss; 639 break; 640 case TCP_NOTIFY_THRESHOLD: 641 if (!checkonly) 642 tcp->tcp_first_timer_threshold = *i1; 643 break; 644 case TCP_ABORT_THRESHOLD: 645 if (!checkonly) 646 tcp->tcp_second_timer_threshold = *i1; 647 break; 648 case TCP_CONN_NOTIFY_THRESHOLD: 649 if (!checkonly) 650 tcp->tcp_first_ctimer_threshold = *i1; 651 break; 652 case TCP_CONN_ABORT_THRESHOLD: 653 if (!checkonly) 654 tcp->tcp_second_ctimer_threshold = *i1; 655 break; 656 case TCP_RECVDSTADDR: 657 if (tcp->tcp_state > TCPS_LISTEN) { 658 *outlenp = 0; 659 return (EOPNOTSUPP); 660 } 661 /* Setting done in conn_opt_set */ 662 break; 663 case TCP_INIT_CWND: 664 if (checkonly) 665 break; 666 667 /* 668 * Only allow socket with network configuration 669 * privilege to set the initial cwnd to be larger 670 * than allowed by RFC 3390. 671 */ 672 if (val <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 673 tcp->tcp_init_cwnd = val; 674 break; 675 } 676 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) { 677 *outlenp = 0; 678 return (reterr); 679 } 680 if (val > tcp_max_init_cwnd) { 681 *outlenp = 0; 682 return (EINVAL); 683 } 684 tcp->tcp_init_cwnd = val; 685 break; 686 case TCP_KEEPALIVE_THRESHOLD: 687 if (checkonly) 688 break; 689 690 if (*i1 < tcps->tcps_keepalive_interval_low || 691 *i1 > tcps->tcps_keepalive_interval_high) { 692 *outlenp = 0; 693 return (EINVAL); 694 } 695 if (*i1 != tcp->tcp_ka_interval) { 696 tcp->tcp_ka_interval = *i1; 697 /* 698 * Check if we need to restart the 699 * keepalive timer. 700 */ 701 if (tcp->tcp_ka_tid != 0) { 702 ASSERT(connp->conn_keepalive); 703 (void) TCP_TIMER_CANCEL(tcp, 704 tcp->tcp_ka_tid); 705 tcp->tcp_ka_last_intrvl = 0; 706 tcp->tcp_ka_tid = TCP_TIMER(tcp, 707 tcp_keepalive_timer, 708 tcp->tcp_ka_interval); 709 } 710 } 711 break; 712 case TCP_KEEPALIVE_ABORT_THRESHOLD: 713 if (!checkonly) { 714 if (*i1 < 715 tcps->tcps_keepalive_abort_interval_low || 716 *i1 > 717 tcps->tcps_keepalive_abort_interval_high) { 718 *outlenp = 0; 719 return (EINVAL); 720 } 721 tcp->tcp_ka_abort_thres = *i1; 722 } 723 break; 724 case TCP_CORK: 725 if (!checkonly) { 726 /* 727 * if tcp->tcp_cork was set and is now 728 * being unset, we have to make sure that 729 * the remaining data gets sent out. Also 730 * unset tcp->tcp_cork so that tcp_wput_data() 731 * can send data even if it is less than mss 732 */ 733 if (tcp->tcp_cork && onoff == 0 && 734 tcp->tcp_unsent > 0) { 735 tcp->tcp_cork = B_FALSE; 736 tcp_wput_data(tcp, NULL, B_FALSE); 737 } 738 tcp->tcp_cork = onoff; 739 } 740 break; 741 case TCP_RTO_INITIAL: { 742 clock_t rto; 743 744 if (checkonly || val == 0) 745 break; 746 747 /* 748 * Sanity checks 749 * 750 * The initial RTO should be bounded by the minimum 751 * and maximum RTO. And it should also be smaller 752 * than the connect attempt abort timeout. Otherwise, 753 * the connection won't be aborted in a period 754 * reasonably close to that timeout. 755 */ 756 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max || 757 val > tcp->tcp_second_ctimer_threshold || 758 val < tcps->tcps_rexmit_interval_initial_low || 759 val > tcps->tcps_rexmit_interval_initial_high) { 760 *outlenp = 0; 761 return (EINVAL); 762 } 763 tcp->tcp_rto_initial = val; 764 765 /* 766 * If TCP has not sent anything, need to re-calculate 767 * tcp_rto. Otherwise, this option change does not 768 * really affect anything. 769 */ 770 if (tcp->tcp_state >= TCPS_SYN_SENT) 771 break; 772 773 tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; 774 tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; 775 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 776 tcps->tcps_rexmit_interval_extra + 777 (tcp->tcp_rtt_sa >> 5) + 778 tcps->tcps_conn_grace_period; 779 TCP_SET_RTO(tcp, rto); 780 break; 781 } 782 case TCP_RTO_MIN: 783 if (checkonly || val == 0) 784 break; 785 786 if (val < tcps->tcps_rexmit_interval_min_low || 787 val > tcps->tcps_rexmit_interval_min_high || 788 val > tcp->tcp_rto_max) { 789 *outlenp = 0; 790 return (EINVAL); 791 } 792 tcp->tcp_rto_min = val; 793 if (tcp->tcp_rto < val) 794 tcp->tcp_rto = val; 795 break; 796 case TCP_RTO_MAX: 797 if (checkonly || val == 0) 798 break; 799 800 /* 801 * Sanity checks 802 * 803 * The maximum RTO should not be larger than the 804 * connection abort timeout. Otherwise, the 805 * connection won't be aborted in a period reasonably 806 * close to that timeout. 807 */ 808 if (val < tcps->tcps_rexmit_interval_max_low || 809 val > tcps->tcps_rexmit_interval_max_high || 810 val < tcp->tcp_rto_min || 811 val > tcp->tcp_second_timer_threshold) { 812 *outlenp = 0; 813 return (EINVAL); 814 } 815 tcp->tcp_rto_max = val; 816 if (tcp->tcp_rto > val) 817 tcp->tcp_rto = val; 818 break; 819 case TCP_LINGER2: 820 if (checkonly || *i1 == 0) 821 break; 822 823 /* 824 * Note that the option value's unit is second. And 825 * the value should be bigger than the private 826 * parameter tcp_fin_wait_2_flush_interval's lower 827 * bound and smaller than the current value of that 828 * parameter. It should be smaller than the current 829 * value to avoid an app setting TCP_LINGER2 to a big 830 * value, causing resource to be held up too long in 831 * FIN-WAIT-2 state. 832 */ 833 if (*i1 < 0 || 834 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS > 835 *i1 || 836 tcps->tcps_fin_wait_2_flush_interval/SECONDS < 837 *i1) { 838 *outlenp = 0; 839 return (EINVAL); 840 } 841 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS; 842 break; 843 default: 844 break; 845 } 846 break; 847 case IPPROTO_IP: 848 if (connp->conn_family != AF_INET) { 849 *outlenp = 0; 850 return (EINVAL); 851 } 852 switch (name) { 853 case IP_SEC_OPT: 854 /* 855 * We should not allow policy setting after 856 * we start listening for connections. 857 */ 858 if (tcp->tcp_state == TCPS_LISTEN) { 859 return (EINVAL); 860 } 861 break; 862 } 863 break; 864 case IPPROTO_IPV6: 865 /* 866 * IPPROTO_IPV6 options are only supported for sockets 867 * that are using IPv6 on the wire. 868 */ 869 if (connp->conn_ipversion != IPV6_VERSION) { 870 *outlenp = 0; 871 return (EINVAL); 872 } 873 874 switch (name) { 875 case IPV6_RECVPKTINFO: 876 if (!checkonly) { 877 /* Force it to be sent up with the next msg */ 878 tcp->tcp_recvifindex = 0; 879 } 880 break; 881 case IPV6_RECVTCLASS: 882 if (!checkonly) { 883 /* Force it to be sent up with the next msg */ 884 tcp->tcp_recvtclass = 0xffffffffU; 885 } 886 break; 887 case IPV6_RECVHOPLIMIT: 888 if (!checkonly) { 889 /* Force it to be sent up with the next msg */ 890 tcp->tcp_recvhops = 0xffffffffU; 891 } 892 break; 893 case IPV6_PKTINFO: 894 /* This is an extra check for TCP */ 895 if (inlen == sizeof (struct in6_pktinfo)) { 896 struct in6_pktinfo *pkti; 897 898 pkti = (struct in6_pktinfo *)invalp; 899 /* 900 * RFC 3542 states that ipi6_addr must be 901 * the unspecified address when setting the 902 * IPV6_PKTINFO sticky socket option on a 903 * TCP socket. 904 */ 905 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 906 return (EINVAL); 907 } 908 break; 909 case IPV6_SEC_OPT: 910 /* 911 * We should not allow policy setting after 912 * we start listening for connections. 913 */ 914 if (tcp->tcp_state == TCPS_LISTEN) { 915 return (EINVAL); 916 } 917 break; 918 } 919 break; 920 } 921 reterr = conn_opt_set(&coas, level, name, inlen, invalp, 922 checkonly, cr); 923 if (reterr != 0) { 924 *outlenp = 0; 925 return (reterr); 926 } 927 928 /* 929 * Common case of OK return with outval same as inval 930 */ 931 if (invalp != outvalp) { 932 /* don't trust bcopy for identical src/dst */ 933 (void) bcopy(invalp, outvalp, inlen); 934 } 935 *outlenp = inlen; 936 937 if (coas.coa_changed & COA_HEADER_CHANGED) { 938 /* If we are connected we rebuilt the headers */ 939 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 940 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 941 reterr = tcp_build_hdrs(tcp); 942 if (reterr != 0) 943 return (reterr); 944 } 945 } 946 if (coas.coa_changed & COA_ROUTE_CHANGED) { 947 in6_addr_t nexthop; 948 949 /* 950 * If we are connected we re-cache the information. 951 * We ignore errors to preserve BSD behavior. 952 * Note that we don't redo IPsec policy lookup here 953 * since the final destination (or source) didn't change. 954 */ 955 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, 956 &connp->conn_faddr_v6, &nexthop); 957 958 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 959 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 960 (void) ip_attr_connect(connp, connp->conn_ixa, 961 &connp->conn_laddr_v6, &connp->conn_faddr_v6, 962 &nexthop, connp->conn_fport, NULL, NULL, 963 IPDF_VERIFY_DST); 964 } 965 } 966 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 967 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 968 } 969 if (coas.coa_changed & COA_WROFF_CHANGED) { 970 connp->conn_wroff = connp->conn_ht_iphc_allocated + 971 tcps->tcps_wroff_xtra; 972 (void) proto_set_tx_wroff(connp->conn_rq, connp, 973 connp->conn_wroff); 974 } 975 if (coas.coa_changed & COA_OOBINLINE_CHANGED) { 976 if (IPCL_IS_NONSTR(connp)) 977 proto_set_rx_oob_opt(connp, onoff); 978 } 979 return (0); 980 } 981