1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright 2019 Joyent, Inc. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #define _SUN_TPI_VERSION 2 32 #include <sys/tihdr.h> 33 #include <sys/socket.h> 34 #include <sys/xti_xtiopt.h> 35 #include <sys/xti_inet.h> 36 #include <sys/policy.h> 37 38 #include <inet/cc.h> 39 #include <inet/common.h> 40 #include <netinet/ip6.h> 41 #include <inet/ip.h> 42 43 #include <netinet/in.h> 44 #include <netinet/tcp.h> 45 #include <inet/optcom.h> 46 #include <inet/proto_set.h> 47 #include <inet/tcp_impl.h> 48 49 static int tcp_opt_default(queue_t *, int, int, uchar_t *); 50 51 /* 52 * Table of all known options handled on a TCP protocol stack. 53 * 54 * Note: This table contains options processed by both TCP and IP levels 55 * and is the superset of options that can be performed on a TCP over IP 56 * stack. 
 */
/*
 * NOTE(review): every initializer below is positional.  The columns appear
 * to be: option name, option level, two access masks (OA_*), a privilege
 * class (OP_*), property flags (OP_DEF_FN / OP_VARLEN / OP_NODEFAULT, or 0),
 * the value size in bytes (presumably the maximum size for OP_VARLEN
 * entries) and the default value -- confirm against the opdes_t definition
 * in <inet/optcom.h>.
 *
 * The entries carrying OP_DEF_FN are the same six options that
 * tcp_opt_default() in this file supplies a value for.
 */
opdes_t	tcp_opt_arr[] = {

/* SOL_SOCKET level options */
{ SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct linger), 0 },

{ SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
	0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_TCP level options */
{ TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
	536 },

{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
	sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
	sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

/* Variable length: the size column is CC_ALGO_NAME_MAX rather than a sizeof */
{ TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_VARLEN, CC_ALGO_NAME_MAX, 0 },

/* IPPROTO_IP and IPPROTO_IPV6 level options */
{ IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },
{ IP_RECVTOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
	sizeof (in_addr_t),	-1 /* not initialized  */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_NODEFAULT|OP_VARLEN),
	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};

/*
 * Table of all supported levels
 *	Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 *	any supported options so we need this info separately.
 *
 *	This is needed only for topmost tpi providers and is used only by
 *	XTI interfaces.
 */
optlevel_t	tcp_valid_levels_arr[] = {
	XTI_GENERIC,
	SOL_SOCKET,
	IPPROTO_TCP,
	IPPROTO_IP,
	IPPROTO_IPV6
};


/* Element counts of the option table and the level table, via A_CNT(). */
#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)

uint_t	tcp_max_optsize; /* initialized when TCP driver is loaded */

/*
 * Initialize option database object for TCP
 *
 * This object represents database of options to search passed to
 * {sock,tpi}optcom_req() interface routine to take care of option
 * management and associated methods.
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default,	/* TCP default value function pointer */
	tcp_tpi_opt_get,	/* TCP get function pointer */
	tcp_tpi_opt_set,	/* TCP set function pointer */
	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
	tcp_opt_arr,		/* TCP option database */
	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
	tcp_valid_levels_arr	/* TCP valid level array */
};

/*
 * Hard ceiling for TCP_INIT_CWND: tcp_opt_set() rejects with EINVAL any
 * request above this value, even when the caller holds the network
 * configuration privilege.
 */
static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;

/*
 * Some TCP options can be "set" by requesting them in the option
 * buffer. This is needed for XTI feature test though we do not
 * allow it in general. We interpret that this mechanism is more
 * applicable to OSI protocols and need not be allowed in general.
 * This routine filters out options for which it is not allowed (most)
 * and lets through those (few) for which it is. [ The XTI interface
 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
 * ever implemented will have to be allowed here ].
291 */ 292 static boolean_t 293 tcp_allow_connopt_set(int level, int name) 294 { 295 296 switch (level) { 297 case IPPROTO_TCP: 298 switch (name) { 299 case TCP_NODELAY: 300 return (B_TRUE); 301 default: 302 return (B_FALSE); 303 } 304 /*NOTREACHED*/ 305 default: 306 return (B_FALSE); 307 } 308 /*NOTREACHED*/ 309 } 310 311 /* 312 * This routine gets default values of certain options whose default 313 * values are maintained by protocol specific code 314 */ 315 /* ARGSUSED */ 316 static int 317 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 318 { 319 int32_t *i1 = (int32_t *)ptr; 320 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 321 322 switch (level) { 323 case IPPROTO_TCP: 324 switch (name) { 325 case TCP_NOTIFY_THRESHOLD: 326 *i1 = tcps->tcps_ip_notify_interval; 327 break; 328 case TCP_ABORT_THRESHOLD: 329 *i1 = tcps->tcps_ip_abort_interval; 330 break; 331 case TCP_CONN_NOTIFY_THRESHOLD: 332 *i1 = tcps->tcps_ip_notify_cinterval; 333 break; 334 case TCP_CONN_ABORT_THRESHOLD: 335 *i1 = tcps->tcps_ip_abort_cinterval; 336 break; 337 default: 338 return (-1); 339 } 340 break; 341 case IPPROTO_IP: 342 switch (name) { 343 case IP_TTL: 344 *i1 = tcps->tcps_ipv4_ttl; 345 break; 346 default: 347 return (-1); 348 } 349 break; 350 case IPPROTO_IPV6: 351 switch (name) { 352 case IPV6_UNICAST_HOPS: 353 *i1 = tcps->tcps_ipv6_hoplimit; 354 break; 355 default: 356 return (-1); 357 } 358 break; 359 default: 360 return (-1); 361 } 362 return (sizeof (int)); 363 } 364 365 /* 366 * TCP routine to get the values of options. 
367 */ 368 int 369 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 370 { 371 int *i1 = (int *)ptr; 372 tcp_t *tcp = connp->conn_tcp; 373 conn_opt_arg_t coas; 374 int retval; 375 376 coas.coa_connp = connp; 377 coas.coa_ixa = connp->conn_ixa; 378 coas.coa_ipp = &connp->conn_xmit_ipp; 379 coas.coa_ancillary = B_FALSE; 380 coas.coa_changed = 0; 381 382 switch (level) { 383 case SOL_SOCKET: 384 switch (name) { 385 case SO_SND_COPYAVOID: 386 *i1 = tcp->tcp_snd_zcopy_on ? 387 SO_SND_COPYAVOID : 0; 388 return (sizeof (int)); 389 case SO_ACCEPTCONN: 390 *i1 = (tcp->tcp_state == TCPS_LISTEN); 391 return (sizeof (int)); 392 } 393 break; 394 case IPPROTO_TCP: 395 switch (name) { 396 case TCP_NODELAY: 397 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 398 return (sizeof (int)); 399 case TCP_MAXSEG: 400 *i1 = tcp->tcp_mss; 401 return (sizeof (int)); 402 case TCP_NOTIFY_THRESHOLD: 403 *i1 = (int)tcp->tcp_first_timer_threshold; 404 return (sizeof (int)); 405 case TCP_ABORT_THRESHOLD: 406 *i1 = tcp->tcp_second_timer_threshold; 407 return (sizeof (int)); 408 case TCP_CONN_NOTIFY_THRESHOLD: 409 *i1 = tcp->tcp_first_ctimer_threshold; 410 return (sizeof (int)); 411 case TCP_CONN_ABORT_THRESHOLD: 412 *i1 = tcp->tcp_second_ctimer_threshold; 413 return (sizeof (int)); 414 case TCP_INIT_CWND: 415 *i1 = tcp->tcp_init_cwnd; 416 return (sizeof (int)); 417 case TCP_KEEPALIVE_THRESHOLD: 418 *i1 = tcp->tcp_ka_interval; 419 return (sizeof (int)); 420 421 /* 422 * TCP_KEEPIDLE expects value in seconds, but 423 * tcp_ka_interval is in milliseconds. 424 */ 425 case TCP_KEEPIDLE: 426 *i1 = tcp->tcp_ka_interval / 1000; 427 return (sizeof (int)); 428 case TCP_KEEPCNT: 429 *i1 = tcp->tcp_ka_cnt; 430 return (sizeof (int)); 431 432 /* 433 * TCP_KEEPINTVL expects value in seconds, but 434 * tcp_ka_rinterval is in milliseconds. 
435 */ 436 case TCP_KEEPINTVL: 437 *i1 = tcp->tcp_ka_rinterval / 1000; 438 return (sizeof (int)); 439 case TCP_KEEPALIVE_ABORT_THRESHOLD: 440 *i1 = tcp->tcp_ka_abort_thres; 441 return (sizeof (int)); 442 case TCP_CONGESTION: { 443 size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name, 444 CC_ALGO_NAME_MAX); 445 if (len >= CC_ALGO_NAME_MAX) 446 return (-1); 447 return (len + 1); 448 } 449 case TCP_CORK: 450 *i1 = tcp->tcp_cork; 451 return (sizeof (int)); 452 case TCP_RTO_INITIAL: 453 *i1 = tcp->tcp_rto_initial; 454 return (sizeof (uint32_t)); 455 case TCP_RTO_MIN: 456 *i1 = tcp->tcp_rto_min; 457 return (sizeof (uint32_t)); 458 case TCP_RTO_MAX: 459 *i1 = tcp->tcp_rto_max; 460 return (sizeof (uint32_t)); 461 case TCP_LINGER2: 462 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS; 463 return (sizeof (int)); 464 } 465 break; 466 case IPPROTO_IP: 467 if (connp->conn_family != AF_INET) 468 return (-1); 469 switch (name) { 470 case IP_OPTIONS: 471 case T_IP_OPTIONS: 472 /* Caller ensures enough space */ 473 return (ip_opt_get_user(connp, ptr)); 474 default: 475 break; 476 } 477 break; 478 479 case IPPROTO_IPV6: 480 /* 481 * IPPROTO_IPV6 options are only supported for sockets 482 * that are using IPv6 on the wire. 483 */ 484 if (connp->conn_ipversion != IPV6_VERSION) { 485 return (-1); 486 } 487 switch (name) { 488 case IPV6_PATHMTU: 489 if (tcp->tcp_state < TCPS_ESTABLISHED) 490 return (-1); 491 break; 492 } 493 break; 494 } 495 mutex_enter(&connp->conn_lock); 496 retval = conn_opt_get(&coas, level, name, ptr); 497 mutex_exit(&connp->conn_lock); 498 return (retval); 499 } 500 501 /* 502 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 503 * Parameters are assumed to be verified by the caller. 
504 */ 505 /* ARGSUSED */ 506 int 507 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 508 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 509 void *thisdg_attrs, cred_t *cr) 510 { 511 tcp_t *tcp = connp->conn_tcp; 512 int *i1 = (int *)invalp; 513 boolean_t onoff = (*i1 == 0) ? 0 : 1; 514 boolean_t checkonly; 515 int reterr; 516 tcp_stack_t *tcps = tcp->tcp_tcps; 517 conn_opt_arg_t coas; 518 uint32_t val = *((uint32_t *)invalp); 519 520 coas.coa_connp = connp; 521 coas.coa_ixa = connp->conn_ixa; 522 coas.coa_ipp = &connp->conn_xmit_ipp; 523 coas.coa_ancillary = B_FALSE; 524 coas.coa_changed = 0; 525 526 switch (optset_context) { 527 case SETFN_OPTCOM_CHECKONLY: 528 checkonly = B_TRUE; 529 /* 530 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 531 * inlen != 0 implies value supplied and 532 * we have to "pretend" to set it. 533 * inlen == 0 implies that there is no 534 * value part in T_CHECK request and just validation 535 * done elsewhere should be enough, we just return here. 536 */ 537 if (inlen == 0) { 538 *outlenp = 0; 539 return (0); 540 } 541 break; 542 case SETFN_OPTCOM_NEGOTIATE: 543 checkonly = B_FALSE; 544 break; 545 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 546 case SETFN_CONN_NEGOTIATE: 547 checkonly = B_FALSE; 548 /* 549 * Negotiating local and "association-related" options 550 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 551 * primitives is allowed by XTI, but we choose 552 * to not implement this style negotiation for Internet 553 * protocols (We interpret it is a must for OSI world but 554 * optional for Internet protocols) for all options. 
555 * [ Will do only for the few options that enable test 556 * suites that our XTI implementation of this feature 557 * works for transports that do allow it ] 558 */ 559 if (!tcp_allow_connopt_set(level, name)) { 560 *outlenp = 0; 561 return (EINVAL); 562 } 563 break; 564 default: 565 /* 566 * We should never get here 567 */ 568 *outlenp = 0; 569 return (EINVAL); 570 } 571 572 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 573 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 574 575 /* 576 * For TCP, we should have no ancillary data sent down 577 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 578 * has to be zero. 579 */ 580 ASSERT(thisdg_attrs == NULL); 581 582 /* 583 * For fixed length options, no sanity check 584 * of passed in length is done. It is assumed *_optcom_req() 585 * routines do the right thing. 586 */ 587 switch (level) { 588 case SOL_SOCKET: 589 switch (name) { 590 case SO_KEEPALIVE: 591 if (checkonly) { 592 /* check only case */ 593 break; 594 } 595 596 if (!onoff) { 597 if (connp->conn_keepalive) { 598 if (tcp->tcp_ka_tid != 0) { 599 (void) TCP_TIMER_CANCEL(tcp, 600 tcp->tcp_ka_tid); 601 tcp->tcp_ka_tid = 0; 602 } 603 connp->conn_keepalive = 0; 604 } 605 break; 606 } 607 if (!connp->conn_keepalive) { 608 /* Crank up the keepalive timer */ 609 tcp->tcp_ka_last_intrvl = 0; 610 tcp->tcp_ka_tid = TCP_TIMER(tcp, 611 tcp_keepalive_timer, tcp->tcp_ka_interval); 612 connp->conn_keepalive = 1; 613 } 614 break; 615 case SO_SNDBUF: { 616 if (*i1 > tcps->tcps_max_buf) { 617 *outlenp = 0; 618 return (ENOBUFS); 619 } 620 if (checkonly) 621 break; 622 623 connp->conn_sndbuf = *i1; 624 if (tcps->tcps_snd_lowat_fraction != 0) { 625 connp->conn_sndlowat = connp->conn_sndbuf / 626 tcps->tcps_snd_lowat_fraction; 627 } 628 (void) tcp_maxpsz_set(tcp, B_TRUE); 629 /* 630 * If we are flow-controlled, recheck the condition. 
631 * There are apps that increase SO_SNDBUF size when 632 * flow-controlled (EWOULDBLOCK), and expect the flow 633 * control condition to be lifted right away. 634 */ 635 mutex_enter(&tcp->tcp_non_sq_lock); 636 if (tcp->tcp_flow_stopped && 637 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { 638 tcp_clrqfull(tcp); 639 } 640 mutex_exit(&tcp->tcp_non_sq_lock); 641 *outlenp = inlen; 642 return (0); 643 } 644 case SO_RCVBUF: 645 if (*i1 > tcps->tcps_max_buf) { 646 *outlenp = 0; 647 return (ENOBUFS); 648 } 649 /* Silently ignore zero */ 650 if (!checkonly && *i1 != 0) { 651 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 652 (void) tcp_rwnd_set(tcp, *i1); 653 } 654 /* 655 * XXX should we return the rwnd here 656 * and tcp_opt_get ? 657 */ 658 *outlenp = inlen; 659 return (0); 660 case SO_SND_COPYAVOID: 661 if (!checkonly) { 662 if (tcp->tcp_loopback || 663 (onoff != 1) || !tcp_zcopy_check(tcp)) { 664 *outlenp = 0; 665 return (EOPNOTSUPP); 666 } 667 tcp->tcp_snd_zcopy_aware = 1; 668 } 669 *outlenp = inlen; 670 return (0); 671 } 672 break; 673 case IPPROTO_TCP: 674 switch (name) { 675 case TCP_NODELAY: 676 if (!checkonly) 677 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss; 678 break; 679 case TCP_NOTIFY_THRESHOLD: 680 if (!checkonly) 681 tcp->tcp_first_timer_threshold = *i1; 682 break; 683 case TCP_ABORT_THRESHOLD: 684 if (!checkonly) 685 tcp->tcp_second_timer_threshold = *i1; 686 break; 687 case TCP_CONN_NOTIFY_THRESHOLD: 688 if (!checkonly) 689 tcp->tcp_first_ctimer_threshold = *i1; 690 break; 691 case TCP_CONN_ABORT_THRESHOLD: 692 if (!checkonly) 693 tcp->tcp_second_ctimer_threshold = *i1; 694 break; 695 case TCP_RECVDSTADDR: 696 if (tcp->tcp_state > TCPS_LISTEN) { 697 *outlenp = 0; 698 return (EOPNOTSUPP); 699 } 700 /* Setting done in conn_opt_set */ 701 break; 702 case TCP_INIT_CWND: 703 if (checkonly) 704 break; 705 706 /* 707 * Only allow socket with network configuration 708 * privilege to set the initial cwnd to be larger 709 * than allowed by RFC 3390. 
710 */ 711 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 712 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) 713 != 0) { 714 *outlenp = 0; 715 return (reterr); 716 } 717 if (val > tcp_max_init_cwnd) { 718 *outlenp = 0; 719 return (EINVAL); 720 } 721 } 722 723 tcp->tcp_init_cwnd = val; 724 725 /* 726 * If the socket is connected, AND no outbound data 727 * has been sent, reset the actual cwnd values. 728 */ 729 if (tcp->tcp_state == TCPS_ESTABLISHED && 730 tcp->tcp_iss == tcp->tcp_snxt - 1) { 731 tcp->tcp_cwnd = 732 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss); 733 } 734 break; 735 736 /* 737 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD 738 * is in milliseconds. TCP_KEEPIDLE is introduced for 739 * compatibility with other Unix flavors. 740 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after 741 * converting the input to milliseconds. 742 */ 743 case TCP_KEEPIDLE: 744 *i1 *= 1000; 745 /* FALLTHRU */ 746 747 case TCP_KEEPALIVE_THRESHOLD: 748 if (checkonly) 749 break; 750 751 if (*i1 < tcps->tcps_keepalive_interval_low || 752 *i1 > tcps->tcps_keepalive_interval_high) { 753 *outlenp = 0; 754 return (EINVAL); 755 } 756 if (*i1 != tcp->tcp_ka_interval) { 757 tcp->tcp_ka_interval = *i1; 758 /* 759 * Check if we need to restart the 760 * keepalive timer. 761 */ 762 if (tcp->tcp_ka_tid != 0) { 763 ASSERT(connp->conn_keepalive); 764 (void) TCP_TIMER_CANCEL(tcp, 765 tcp->tcp_ka_tid); 766 tcp->tcp_ka_last_intrvl = 0; 767 tcp->tcp_ka_tid = TCP_TIMER(tcp, 768 tcp_keepalive_timer, 769 tcp->tcp_ka_interval); 770 } 771 } 772 break; 773 774 /* 775 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt. 776 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the 777 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and 778 * tcp_ka_cnt. 
779 */ 780 case TCP_KEEPCNT: 781 if (checkonly) 782 break; 783 784 if (*i1 == 0) { 785 return (EINVAL); 786 } else if (tcp->tcp_ka_rinterval == 0) { 787 /* 788 * When TCP_KEEPCNT is specified without first 789 * specifying a TCP_KEEPINTVL, we infer an 790 * interval based on a tunable specific to our 791 * stack: the tcp_keepalive_abort_interval. 792 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in 793 * the unlikely event that that has been set.) 794 * Given the abort interval's default value of 795 * 480 seconds, low TCP_KEEPCNT values can 796 * result in intervals that exceed the default 797 * maximum RTO of 60 seconds. Rather than 798 * fail in these cases, we (implicitly) clamp 799 * the interval at the maximum RTO; if the 800 * TCP_KEEPCNT is shortly followed by a 801 * TCP_KEEPINTVL (as we expect), the abort 802 * threshold will be recalculated correctly -- 803 * and if a TCP_KEEPINTVL is not forthcoming, 804 * keep-alive will at least operate reasonably 805 * given the underconfigured state. 806 */ 807 uint32_t interval; 808 809 interval = tcp->tcp_ka_abort_thres / *i1; 810 811 if (interval < tcp->tcp_rto_min) 812 interval = tcp->tcp_rto_min; 813 814 if (interval > tcp->tcp_rto_max) 815 interval = tcp->tcp_rto_max; 816 817 tcp->tcp_ka_rinterval = interval; 818 } else { 819 if ((*i1 * tcp->tcp_ka_rinterval) < 820 tcps->tcps_keepalive_abort_interval_low || 821 (*i1 * tcp->tcp_ka_rinterval) > 822 tcps->tcps_keepalive_abort_interval_high) 823 return (EINVAL); 824 tcp->tcp_ka_abort_thres = 825 (*i1 * tcp->tcp_ka_rinterval); 826 } 827 tcp->tcp_ka_cnt = *i1; 828 break; 829 case TCP_KEEPINTVL: 830 /* 831 * TCP_KEEPINTVL is specified in seconds, but 832 * tcp_ka_rinterval is in milliseconds. 
833 */ 834 835 if (checkonly) 836 break; 837 838 if ((*i1 * 1000) < tcp->tcp_rto_min || 839 (*i1 * 1000) > tcp->tcp_rto_max) 840 return (EINVAL); 841 842 if (tcp->tcp_ka_cnt == 0) { 843 tcp->tcp_ka_cnt = 844 tcp->tcp_ka_abort_thres / (*i1 * 1000); 845 } else { 846 if ((*i1 * tcp->tcp_ka_cnt * 1000) < 847 tcps->tcps_keepalive_abort_interval_low || 848 (*i1 * tcp->tcp_ka_cnt * 1000) > 849 tcps->tcps_keepalive_abort_interval_high) 850 return (EINVAL); 851 tcp->tcp_ka_abort_thres = 852 (*i1 * tcp->tcp_ka_cnt * 1000); 853 } 854 tcp->tcp_ka_rinterval = *i1 * 1000; 855 break; 856 case TCP_KEEPALIVE_ABORT_THRESHOLD: 857 if (!checkonly) { 858 if (*i1 < 859 tcps->tcps_keepalive_abort_interval_low || 860 *i1 > 861 tcps->tcps_keepalive_abort_interval_high) { 862 *outlenp = 0; 863 return (EINVAL); 864 } 865 tcp->tcp_ka_abort_thres = *i1; 866 tcp->tcp_ka_cnt = 0; 867 tcp->tcp_ka_rinterval = 0; 868 } 869 break; 870 case TCP_CONGESTION: { 871 struct cc_algo *algo; 872 873 if (checkonly) { 874 break; 875 } 876 877 /* 878 * Make sure the string is NUL-terminated. Some 879 * consumers pass only the number of characters 880 * in the string, and don't include the NUL 881 * terminator, so we set it for them. 882 */ 883 if (inlen < CC_ALGO_NAME_MAX) { 884 invalp[inlen] = '\0'; 885 } 886 invalp[CC_ALGO_NAME_MAX - 1] = '\0'; 887 888 if ((algo = cc_load_algo((char *)invalp)) == NULL) { 889 return (ENOENT); 890 } 891 892 if (CC_ALGO(tcp)->cb_destroy != NULL) { 893 CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv); 894 } 895 896 CC_DATA(tcp) = NULL; 897 CC_ALGO(tcp) = algo; 898 899 if (CC_ALGO(tcp)->cb_init != NULL) { 900 VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv)); 901 } 902 903 break; 904 } 905 case TCP_CORK: 906 if (!checkonly) { 907 /* 908 * if tcp->tcp_cork was set and is now 909 * being unset, we have to make sure that 910 * the remaining data gets sent out. 
Also 911 * unset tcp->tcp_cork so that tcp_wput_data() 912 * can send data even if it is less than mss 913 */ 914 if (tcp->tcp_cork && onoff == 0 && 915 tcp->tcp_unsent > 0) { 916 tcp->tcp_cork = B_FALSE; 917 tcp_wput_data(tcp, NULL, B_FALSE); 918 } 919 tcp->tcp_cork = onoff; 920 } 921 break; 922 case TCP_RTO_INITIAL: 923 if (checkonly || val == 0) 924 break; 925 926 /* 927 * Sanity checks 928 * 929 * The initial RTO should be bounded by the minimum 930 * and maximum RTO. And it should also be smaller 931 * than the connect attempt abort timeout. Otherwise, 932 * the connection won't be aborted in a period 933 * reasonably close to that timeout. 934 */ 935 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max || 936 val > tcp->tcp_second_ctimer_threshold || 937 val < tcps->tcps_rexmit_interval_initial_low || 938 val > tcps->tcps_rexmit_interval_initial_high) { 939 *outlenp = 0; 940 return (EINVAL); 941 } 942 tcp->tcp_rto_initial = val; 943 944 /* 945 * If TCP has not sent anything, need to re-calculate 946 * tcp_rto. Otherwise, this option change does not 947 * really affect anything. 948 */ 949 if (tcp->tcp_state >= TCPS_SYN_SENT) 950 break; 951 952 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2; 953 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1; 954 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 955 tcps->tcps_conn_grace_period); 956 break; 957 case TCP_RTO_MIN: 958 if (checkonly || val == 0) 959 break; 960 961 if (val < tcps->tcps_rexmit_interval_min_low || 962 val > tcps->tcps_rexmit_interval_min_high || 963 val > tcp->tcp_rto_max) { 964 *outlenp = 0; 965 return (EINVAL); 966 } 967 tcp->tcp_rto_min = val; 968 if (tcp->tcp_rto < val) 969 tcp->tcp_rto = val; 970 break; 971 case TCP_RTO_MAX: 972 if (checkonly || val == 0) 973 break; 974 975 /* 976 * Sanity checks 977 * 978 * The maximum RTO should not be larger than the 979 * connection abort timeout. Otherwise, the 980 * connection won't be aborted in a period reasonably 981 * close to that timeout. 
982 */ 983 if (val < tcps->tcps_rexmit_interval_max_low || 984 val > tcps->tcps_rexmit_interval_max_high || 985 val < tcp->tcp_rto_min || 986 val > tcp->tcp_second_timer_threshold) { 987 *outlenp = 0; 988 return (EINVAL); 989 } 990 tcp->tcp_rto_max = val; 991 if (tcp->tcp_rto > val) 992 tcp->tcp_rto = val; 993 break; 994 case TCP_LINGER2: 995 if (checkonly || *i1 == 0) 996 break; 997 998 /* 999 * Note that the option value's unit is second. And 1000 * the value should be bigger than the private 1001 * parameter tcp_fin_wait_2_flush_interval's lower 1002 * bound and smaller than the current value of that 1003 * parameter. It should be smaller than the current 1004 * value to avoid an app setting TCP_LINGER2 to a big 1005 * value, causing resource to be held up too long in 1006 * FIN-WAIT-2 state. 1007 */ 1008 if (*i1 < 0 || 1009 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS > 1010 *i1 || 1011 tcps->tcps_fin_wait_2_flush_interval/SECONDS < 1012 *i1) { 1013 *outlenp = 0; 1014 return (EINVAL); 1015 } 1016 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS; 1017 break; 1018 default: 1019 break; 1020 } 1021 break; 1022 case IPPROTO_IP: 1023 if (connp->conn_family != AF_INET) { 1024 *outlenp = 0; 1025 return (EINVAL); 1026 } 1027 switch (name) { 1028 case IP_SEC_OPT: 1029 /* 1030 * We should not allow policy setting after 1031 * we start listening for connections. 1032 */ 1033 if (tcp->tcp_state == TCPS_LISTEN) { 1034 return (EINVAL); 1035 } 1036 break; 1037 case IP_RECVTOS: 1038 if (!checkonly) { 1039 /* 1040 * Force it to be sent up with the next msg 1041 * by setting it to a value which cannot 1042 * appear in a packet (TOS is only 8-bits) 1043 */ 1044 tcp->tcp_recvtos = 0xffffffffU; 1045 } 1046 break; 1047 } 1048 break; 1049 case IPPROTO_IPV6: 1050 /* 1051 * IPPROTO_IPV6 options are only supported for sockets 1052 * that are using IPv6 on the wire. 
1053 */ 1054 if (connp->conn_ipversion != IPV6_VERSION) { 1055 *outlenp = 0; 1056 return (EINVAL); 1057 } 1058 1059 switch (name) { 1060 case IPV6_RECVPKTINFO: 1061 if (!checkonly) { 1062 /* Force it to be sent up with the next msg */ 1063 tcp->tcp_recvifindex = 0; 1064 } 1065 break; 1066 case IPV6_RECVTCLASS: 1067 if (!checkonly) { 1068 /* Force it to be sent up with the next msg */ 1069 tcp->tcp_recvtclass = 0xffffffffU; 1070 } 1071 break; 1072 case IPV6_RECVHOPLIMIT: 1073 if (!checkonly) { 1074 /* Force it to be sent up with the next msg */ 1075 tcp->tcp_recvhops = 0xffffffffU; 1076 } 1077 break; 1078 case IPV6_PKTINFO: 1079 /* This is an extra check for TCP */ 1080 if (inlen == sizeof (struct in6_pktinfo)) { 1081 struct in6_pktinfo *pkti; 1082 1083 pkti = (struct in6_pktinfo *)invalp; 1084 /* 1085 * RFC 3542 states that ipi6_addr must be 1086 * the unspecified address when setting the 1087 * IPV6_PKTINFO sticky socket option on a 1088 * TCP socket. 1089 */ 1090 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 1091 return (EINVAL); 1092 } 1093 break; 1094 case IPV6_SEC_OPT: 1095 /* 1096 * We should not allow policy setting after 1097 * we start listening for connections. 
1098 */ 1099 if (tcp->tcp_state == TCPS_LISTEN) { 1100 return (EINVAL); 1101 } 1102 break; 1103 } 1104 break; 1105 } 1106 reterr = conn_opt_set(&coas, level, name, inlen, invalp, 1107 checkonly, cr); 1108 if (reterr != 0) { 1109 *outlenp = 0; 1110 return (reterr); 1111 } 1112 1113 /* 1114 * Common case of OK return with outval same as inval 1115 */ 1116 if (invalp != outvalp) { 1117 /* don't trust bcopy for identical src/dst */ 1118 (void) bcopy(invalp, outvalp, inlen); 1119 } 1120 *outlenp = inlen; 1121 1122 if (coas.coa_changed & COA_HEADER_CHANGED) { 1123 /* If we are connected we rebuilt the headers */ 1124 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1125 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1126 reterr = tcp_build_hdrs(tcp); 1127 if (reterr != 0) 1128 return (reterr); 1129 } 1130 } 1131 if (coas.coa_changed & COA_ROUTE_CHANGED) { 1132 in6_addr_t nexthop; 1133 1134 /* 1135 * If we are connected we re-cache the information. 1136 * We ignore errors to preserve BSD behavior. 1137 * Note that we don't redo IPsec policy lookup here 1138 * since the final destination (or source) didn't change. 
1139 */ 1140 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, 1141 &connp->conn_faddr_v6, &nexthop); 1142 1143 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1144 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1145 (void) ip_attr_connect(connp, connp->conn_ixa, 1146 &connp->conn_laddr_v6, &connp->conn_faddr_v6, 1147 &nexthop, connp->conn_fport, NULL, NULL, 1148 IPDF_VERIFY_DST); 1149 } 1150 } 1151 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 1152 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 1153 } 1154 if (coas.coa_changed & COA_WROFF_CHANGED) { 1155 connp->conn_wroff = connp->conn_ht_iphc_allocated + 1156 tcps->tcps_wroff_xtra; 1157 (void) proto_set_tx_wroff(connp->conn_rq, connp, 1158 connp->conn_wroff); 1159 } 1160 if (coas.coa_changed & COA_OOBINLINE_CHANGED) { 1161 if (IPCL_IS_NONSTR(connp)) 1162 proto_set_rx_oob_opt(connp, onoff); 1163 } 1164 return (0); 1165 } 1166