1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright 2019 Joyent, Inc. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #define _SUN_TPI_VERSION 2 31 #include <sys/tihdr.h> 32 #include <sys/socket.h> 33 #include <sys/xti_xtiopt.h> 34 #include <sys/xti_inet.h> 35 #include <sys/policy.h> 36 37 #include <inet/cc.h> 38 #include <inet/common.h> 39 #include <netinet/ip6.h> 40 #include <inet/ip.h> 41 42 #include <netinet/in.h> 43 #include <netinet/tcp.h> 44 #include <inet/optcom.h> 45 #include <inet/proto_set.h> 46 #include <inet/tcp_impl.h> 47 48 static int tcp_opt_default(queue_t *, int, int, uchar_t *); 49 50 /* 51 * Table of all known options handled on a TCP protocol stack. 52 * 53 * Note: This table contains options processed by both TCP and IP levels 54 * and is the superset of options that can be performed on a TCP over IP 55 * stack. 56 */ 57 opdes_t tcp_opt_arr[] = { 58 59 { SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 60 sizeof (struct linger), 0 }, 61 62 { SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 63 { SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 64 { SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 65 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 66 }, 67 { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 68 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 69 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 70 { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 71 { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 72 { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 73 { SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 74 sizeof (struct timeval), 0 }, 75 { SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, 76 sizeof (struct timeval), 0 }, 77 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 78 }, 79 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 80 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 81 0 }, 82 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 83 0 }, 84 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 85 0 }, 86 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 87 0 }, 88 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 89 90 { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 91 92 { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, 93 94 { TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 95 }, 96 { TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t), 97 536 }, 98 99 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 100 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 101 102 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 103 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 104 105 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 106 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 107 108 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 109 OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, 110 111 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 112 0 }, 113 114 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0, 115 sizeof (int), 0 }, 116 117 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 118 }, 119 120 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0, 121 sizeof (int), 0 }, 122 123 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 124 sizeof (int), 0 }, 125 126 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 127 128 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 129 130 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 131 132 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, 133 sizeof (int), 0 }, 134 135 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 136 137 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 138 139 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 140 141 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, 142 143 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 144 145 { TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 146 OP_VARLEN, CC_ALGO_NAME_MAX, 0 }, 147 148 { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 149 (OP_VARLEN|OP_NODEFAULT), 150 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 151 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 152 (OP_VARLEN|OP_NODEFAULT), 153 IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, 154 155 { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 156 { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 157 { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 158 sizeof (int), -1 /* not initialized */ }, 159 160 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 161 sizeof (ipsec_req_t), -1 /* not initialized */ }, 162 163 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, 164 sizeof (int), 0 /* no ifindex */ }, 165 166 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, 167 sizeof (int), 0 }, 168 169 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, 170 sizeof (int), -1 /* not initialized */ }, 171 172 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 173 sizeof (int), 0 /* no ifindex */ }, 174 175 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, 176 177 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, 178 sizeof (in_addr_t), -1 /* not initialized */ }, 179 180 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, 181 sizeof (int), 0 }, 182 183 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 184 (OP_NODEFAULT|OP_VARLEN), 185 sizeof (struct in6_pktinfo), -1 /* not initialized */ }, 186 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 187 OP_NODEFAULT, 188 sizeof (sin6_t), -1 /* not initialized */ }, 189 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 190 (OP_VARLEN|OP_NODEFAULT), 255*8, 191 -1 /* not initialized */ }, 192 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 193 (OP_VARLEN|OP_NODEFAULT), 255*8, 194 -1 /* not initialized */ }, 195 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 196 (OP_VARLEN|OP_NODEFAULT), 255*8, 197 -1 /* not initialized */ }, 198 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 199 (OP_VARLEN|OP_NODEFAULT), 255*8, 200 -1 /* not initialized */ }, 201 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 202 OP_NODEFAULT, 203 sizeof (int), -1 /* not initialized */ }, 204 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 205 OP_NODEFAULT, 206 sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, 207 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 208 sizeof (int), 0 }, 209 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 210 sizeof (int), 0 }, 211 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 212 sizeof (int), 0 }, 213 214 /* Enable receipt of ancillary data */ 215 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 216 sizeof (int), 0 }, 217 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 218 sizeof (int), 0 }, 219 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 220 sizeof (int), 0 }, 221 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 222 sizeof (int), 0 }, 223 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 224 sizeof (int), 0 }, 225 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 226 sizeof (int), 0 }, 227 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 228 sizeof (int), 0 }, 229 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 230 sizeof (int), 0 }, 231 232 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, 233 sizeof (ipsec_req_t), -1 /* not initialized */ }, 234 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, 235 sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, 236 }; 237 238 /* 239 * Table of all supported levels 240 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have 241 * any supported options so we need this info separately. 242 * 243 * This is needed only for topmost tpi providers and is used only by 244 * XTI interfaces. 245 */ 246 optlevel_t tcp_valid_levels_arr[] = { 247 XTI_GENERIC, 248 SOL_SOCKET, 249 IPPROTO_TCP, 250 IPPROTO_IP, 251 IPPROTO_IPV6 252 }; 253 254 255 #define TCP_OPT_ARR_CNT A_CNT(tcp_opt_arr) 256 #define TCP_VALID_LEVELS_CNT A_CNT(tcp_valid_levels_arr) 257 258 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */ 259 260 /* 261 * Initialize option database object for TCP 262 * 263 * This object represents database of options to search passed to 264 * {sock,tpi}optcom_req() interface routine to take care of option 265 * management and associated methods. 266 */ 267 268 optdb_obj_t tcp_opt_obj = { 269 tcp_opt_default, /* TCP default value function pointer */ 270 tcp_tpi_opt_get, /* TCP get function pointer */ 271 tcp_tpi_opt_set, /* TCP set function pointer */ 272 TCP_OPT_ARR_CNT, /* TCP option database count of entries */ 273 tcp_opt_arr, /* TCP option database */ 274 TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */ 275 tcp_valid_levels_arr /* TCP valid level array */ 276 }; 277 278 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND; 279 280 /* 281 * Some TCP options can be "set" by requesting them in the option 282 * buffer. This is needed for XTI feature test though we do not 283 * allow it in general. We interpret that this mechanism is more 284 * applicable to OSI protocols and need not be allowed in general. 285 * This routine filters out options for which it is not allowed (most) 286 * and lets through those (few) for which it is. [ The XTI interface 287 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 288 * ever implemented will have to be allowed here ]. 289 */ 290 static boolean_t 291 tcp_allow_connopt_set(int level, int name) 292 { 293 294 switch (level) { 295 case IPPROTO_TCP: 296 switch (name) { 297 case TCP_NODELAY: 298 return (B_TRUE); 299 default: 300 return (B_FALSE); 301 } 302 /*NOTREACHED*/ 303 default: 304 return (B_FALSE); 305 } 306 /*NOTREACHED*/ 307 } 308 309 /* 310 * This routine gets default values of certain options whose default 311 * values are maintained by protocol specific code 312 */ 313 /* ARGSUSED */ 314 static int 315 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 316 { 317 int32_t *i1 = (int32_t *)ptr; 318 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 319 320 switch (level) { 321 case IPPROTO_TCP: 322 switch (name) { 323 case TCP_NOTIFY_THRESHOLD: 324 *i1 = tcps->tcps_ip_notify_interval; 325 break; 326 case TCP_ABORT_THRESHOLD: 327 *i1 = tcps->tcps_ip_abort_interval; 328 break; 329 case TCP_CONN_NOTIFY_THRESHOLD: 330 *i1 = tcps->tcps_ip_notify_cinterval; 331 break; 332 case TCP_CONN_ABORT_THRESHOLD: 333 *i1 = tcps->tcps_ip_abort_cinterval; 334 break; 335 default: 336 return (-1); 337 } 338 break; 339 case IPPROTO_IP: 340 switch (name) { 341 case IP_TTL: 342 *i1 = tcps->tcps_ipv4_ttl; 343 break; 344 default: 345 return (-1); 346 } 347 break; 348 case IPPROTO_IPV6: 349 switch (name) { 350 case IPV6_UNICAST_HOPS: 351 *i1 = tcps->tcps_ipv6_hoplimit; 352 break; 353 default: 354 return (-1); 355 } 356 break; 357 default: 358 return (-1); 359 } 360 return (sizeof (int)); 361 } 362 363 /* 364 * TCP routine to get the values of options. 365 */ 366 int 367 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 368 { 369 int *i1 = (int *)ptr; 370 tcp_t *tcp = connp->conn_tcp; 371 conn_opt_arg_t coas; 372 int retval; 373 374 coas.coa_connp = connp; 375 coas.coa_ixa = connp->conn_ixa; 376 coas.coa_ipp = &connp->conn_xmit_ipp; 377 coas.coa_ancillary = B_FALSE; 378 coas.coa_changed = 0; 379 380 switch (level) { 381 case SOL_SOCKET: 382 switch (name) { 383 case SO_SND_COPYAVOID: 384 *i1 = tcp->tcp_snd_zcopy_on ? 385 SO_SND_COPYAVOID : 0; 386 return (sizeof (int)); 387 case SO_ACCEPTCONN: 388 *i1 = (tcp->tcp_state == TCPS_LISTEN); 389 return (sizeof (int)); 390 } 391 break; 392 case IPPROTO_TCP: 393 switch (name) { 394 case TCP_NODELAY: 395 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 396 return (sizeof (int)); 397 case TCP_MAXSEG: 398 *i1 = tcp->tcp_mss; 399 return (sizeof (int)); 400 case TCP_NOTIFY_THRESHOLD: 401 *i1 = (int)tcp->tcp_first_timer_threshold; 402 return (sizeof (int)); 403 case TCP_ABORT_THRESHOLD: 404 *i1 = tcp->tcp_second_timer_threshold; 405 return (sizeof (int)); 406 case TCP_CONN_NOTIFY_THRESHOLD: 407 *i1 = tcp->tcp_first_ctimer_threshold; 408 return (sizeof (int)); 409 case TCP_CONN_ABORT_THRESHOLD: 410 *i1 = tcp->tcp_second_ctimer_threshold; 411 return (sizeof (int)); 412 case TCP_INIT_CWND: 413 *i1 = tcp->tcp_init_cwnd; 414 return (sizeof (int)); 415 case TCP_KEEPALIVE_THRESHOLD: 416 *i1 = tcp->tcp_ka_interval; 417 return (sizeof (int)); 418 419 /* 420 * TCP_KEEPIDLE expects value in seconds, but 421 * tcp_ka_interval is in milliseconds. 422 */ 423 case TCP_KEEPIDLE: 424 *i1 = tcp->tcp_ka_interval / 1000; 425 return (sizeof (int)); 426 case TCP_KEEPCNT: 427 *i1 = tcp->tcp_ka_cnt; 428 return (sizeof (int)); 429 430 /* 431 * TCP_KEEPINTVL expects value in seconds, but 432 * tcp_ka_rinterval is in milliseconds. 433 */ 434 case TCP_KEEPINTVL: 435 *i1 = tcp->tcp_ka_rinterval / 1000; 436 return (sizeof (int)); 437 case TCP_KEEPALIVE_ABORT_THRESHOLD: 438 *i1 = tcp->tcp_ka_abort_thres; 439 return (sizeof (int)); 440 case TCP_CONGESTION: { 441 size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name, 442 CC_ALGO_NAME_MAX); 443 if (len >= CC_ALGO_NAME_MAX) 444 return (-1); 445 return (len + 1); 446 } 447 case TCP_CORK: 448 *i1 = tcp->tcp_cork; 449 return (sizeof (int)); 450 case TCP_RTO_INITIAL: 451 *i1 = tcp->tcp_rto_initial; 452 return (sizeof (uint32_t)); 453 case TCP_RTO_MIN: 454 *i1 = tcp->tcp_rto_min; 455 return (sizeof (uint32_t)); 456 case TCP_RTO_MAX: 457 *i1 = tcp->tcp_rto_max; 458 return (sizeof (uint32_t)); 459 case TCP_LINGER2: 460 *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS; 461 return (sizeof (int)); 462 } 463 break; 464 case IPPROTO_IP: 465 if (connp->conn_family != AF_INET) 466 return (-1); 467 switch (name) { 468 case IP_OPTIONS: 469 case T_IP_OPTIONS: 470 /* Caller ensures enough space */ 471 return (ip_opt_get_user(connp, ptr)); 472 default: 473 break; 474 } 475 break; 476 477 case IPPROTO_IPV6: 478 /* 479 * IPPROTO_IPV6 options are only supported for sockets 480 * that are using IPv6 on the wire. 481 */ 482 if (connp->conn_ipversion != IPV6_VERSION) { 483 return (-1); 484 } 485 switch (name) { 486 case IPV6_PATHMTU: 487 if (tcp->tcp_state < TCPS_ESTABLISHED) 488 return (-1); 489 break; 490 } 491 break; 492 } 493 mutex_enter(&connp->conn_lock); 494 retval = conn_opt_get(&coas, level, name, ptr); 495 mutex_exit(&connp->conn_lock); 496 return (retval); 497 } 498 499 /* 500 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 501 * Parameters are assumed to be verified by the caller. 502 */ 503 /* ARGSUSED */ 504 int 505 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 506 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 507 void *thisdg_attrs, cred_t *cr) 508 { 509 tcp_t *tcp = connp->conn_tcp; 510 int *i1 = (int *)invalp; 511 boolean_t onoff = (*i1 == 0) ? 0 : 1; 512 boolean_t checkonly; 513 int reterr; 514 tcp_stack_t *tcps = tcp->tcp_tcps; 515 conn_opt_arg_t coas; 516 uint32_t val = *((uint32_t *)invalp); 517 518 coas.coa_connp = connp; 519 coas.coa_ixa = connp->conn_ixa; 520 coas.coa_ipp = &connp->conn_xmit_ipp; 521 coas.coa_ancillary = B_FALSE; 522 coas.coa_changed = 0; 523 524 switch (optset_context) { 525 case SETFN_OPTCOM_CHECKONLY: 526 checkonly = B_TRUE; 527 /* 528 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 529 * inlen != 0 implies value supplied and 530 * we have to "pretend" to set it. 531 * inlen == 0 implies that there is no 532 * value part in T_CHECK request and just validation 533 * done elsewhere should be enough, we just return here. 534 */ 535 if (inlen == 0) { 536 *outlenp = 0; 537 return (0); 538 } 539 break; 540 case SETFN_OPTCOM_NEGOTIATE: 541 checkonly = B_FALSE; 542 break; 543 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 544 case SETFN_CONN_NEGOTIATE: 545 checkonly = B_FALSE; 546 /* 547 * Negotiating local and "association-related" options 548 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 549 * primitives is allowed by XTI, but we choose 550 * to not implement this style negotiation for Internet 551 * protocols (We interpret it is a must for OSI world but 552 * optional for Internet protocols) for all options. 553 * [ Will do only for the few options that enable test 554 * suites that our XTI implementation of this feature 555 * works for transports that do allow it ] 556 */ 557 if (!tcp_allow_connopt_set(level, name)) { 558 *outlenp = 0; 559 return (EINVAL); 560 } 561 break; 562 default: 563 /* 564 * We should never get here 565 */ 566 *outlenp = 0; 567 return (EINVAL); 568 } 569 570 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 571 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 572 573 /* 574 * For TCP, we should have no ancillary data sent down 575 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 576 * has to be zero. 577 */ 578 ASSERT(thisdg_attrs == NULL); 579 580 /* 581 * For fixed length options, no sanity check 582 * of passed in length is done. It is assumed *_optcom_req() 583 * routines do the right thing. 584 */ 585 switch (level) { 586 case SOL_SOCKET: 587 switch (name) { 588 case SO_KEEPALIVE: 589 if (checkonly) { 590 /* check only case */ 591 break; 592 } 593 594 if (!onoff) { 595 if (connp->conn_keepalive) { 596 if (tcp->tcp_ka_tid != 0) { 597 (void) TCP_TIMER_CANCEL(tcp, 598 tcp->tcp_ka_tid); 599 tcp->tcp_ka_tid = 0; 600 } 601 connp->conn_keepalive = 0; 602 } 603 break; 604 } 605 if (!connp->conn_keepalive) { 606 /* Crank up the keepalive timer */ 607 tcp->tcp_ka_last_intrvl = 0; 608 tcp->tcp_ka_tid = TCP_TIMER(tcp, 609 tcp_keepalive_timer, tcp->tcp_ka_interval); 610 connp->conn_keepalive = 1; 611 } 612 break; 613 case SO_SNDBUF: { 614 if (*i1 > tcps->tcps_max_buf) { 615 *outlenp = 0; 616 return (ENOBUFS); 617 } 618 if (checkonly) 619 break; 620 621 connp->conn_sndbuf = *i1; 622 if (tcps->tcps_snd_lowat_fraction != 0) { 623 connp->conn_sndlowat = connp->conn_sndbuf / 624 tcps->tcps_snd_lowat_fraction; 625 } 626 (void) tcp_maxpsz_set(tcp, B_TRUE); 627 /* 628 * If we are flow-controlled, recheck the condition. 629 * There are apps that increase SO_SNDBUF size when 630 * flow-controlled (EWOULDBLOCK), and expect the flow 631 * control condition to be lifted right away. 632 */ 633 mutex_enter(&tcp->tcp_non_sq_lock); 634 if (tcp->tcp_flow_stopped && 635 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { 636 tcp_clrqfull(tcp); 637 } 638 mutex_exit(&tcp->tcp_non_sq_lock); 639 *outlenp = inlen; 640 return (0); 641 } 642 case SO_RCVBUF: 643 if (*i1 > tcps->tcps_max_buf) { 644 *outlenp = 0; 645 return (ENOBUFS); 646 } 647 /* Silently ignore zero */ 648 if (!checkonly && *i1 != 0) { 649 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 650 (void) tcp_rwnd_set(tcp, *i1); 651 } 652 /* 653 * XXX should we return the rwnd here 654 * and tcp_opt_get ? 655 */ 656 *outlenp = inlen; 657 return (0); 658 case SO_SND_COPYAVOID: 659 if (!checkonly) { 660 if (tcp->tcp_loopback || 661 (onoff != 1) || !tcp_zcopy_check(tcp)) { 662 *outlenp = 0; 663 return (EOPNOTSUPP); 664 } 665 tcp->tcp_snd_zcopy_aware = 1; 666 } 667 *outlenp = inlen; 668 return (0); 669 } 670 break; 671 case IPPROTO_TCP: 672 switch (name) { 673 case TCP_NODELAY: 674 if (!checkonly) 675 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss; 676 break; 677 case TCP_NOTIFY_THRESHOLD: 678 if (!checkonly) 679 tcp->tcp_first_timer_threshold = *i1; 680 break; 681 case TCP_ABORT_THRESHOLD: 682 if (!checkonly) 683 tcp->tcp_second_timer_threshold = *i1; 684 break; 685 case TCP_CONN_NOTIFY_THRESHOLD: 686 if (!checkonly) 687 tcp->tcp_first_ctimer_threshold = *i1; 688 break; 689 case TCP_CONN_ABORT_THRESHOLD: 690 if (!checkonly) 691 tcp->tcp_second_ctimer_threshold = *i1; 692 break; 693 case TCP_RECVDSTADDR: 694 if (tcp->tcp_state > TCPS_LISTEN) { 695 *outlenp = 0; 696 return (EOPNOTSUPP); 697 } 698 /* Setting done in conn_opt_set */ 699 break; 700 case TCP_INIT_CWND: 701 if (checkonly) 702 break; 703 704 /* 705 * Only allow socket with network configuration 706 * privilege to set the initial cwnd to be larger 707 * than allowed by RFC 3390. 708 */ 709 if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 710 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) 711 != 0) { 712 *outlenp = 0; 713 return (reterr); 714 } 715 if (val > tcp_max_init_cwnd) { 716 *outlenp = 0; 717 return (EINVAL); 718 } 719 } 720 721 tcp->tcp_init_cwnd = val; 722 723 /* 724 * If the socket is connected, AND no outbound data 725 * has been sent, reset the actual cwnd values. 726 */ 727 if (tcp->tcp_state == TCPS_ESTABLISHED && 728 tcp->tcp_iss == tcp->tcp_snxt - 1) { 729 tcp->tcp_cwnd = 730 MIN(tcp->tcp_rwnd, val * tcp->tcp_mss); 731 } 732 break; 733 734 /* 735 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD 736 * is in milliseconds. TCP_KEEPIDLE is introduced for 737 * compatibility with other Unix flavors. 738 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after 739 * converting the input to milliseconds. 740 */ 741 case TCP_KEEPIDLE: 742 *i1 *= 1000; 743 /* FALLTHRU */ 744 745 case TCP_KEEPALIVE_THRESHOLD: 746 if (checkonly) 747 break; 748 749 if (*i1 < tcps->tcps_keepalive_interval_low || 750 *i1 > tcps->tcps_keepalive_interval_high) { 751 *outlenp = 0; 752 return (EINVAL); 753 } 754 if (*i1 != tcp->tcp_ka_interval) { 755 tcp->tcp_ka_interval = *i1; 756 /* 757 * Check if we need to restart the 758 * keepalive timer. 759 */ 760 if (tcp->tcp_ka_tid != 0) { 761 ASSERT(connp->conn_keepalive); 762 (void) TCP_TIMER_CANCEL(tcp, 763 tcp->tcp_ka_tid); 764 tcp->tcp_ka_last_intrvl = 0; 765 tcp->tcp_ka_tid = TCP_TIMER(tcp, 766 tcp_keepalive_timer, 767 tcp->tcp_ka_interval); 768 } 769 } 770 break; 771 772 /* 773 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt. 774 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the 775 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and 776 * tcp_ka_cnt. 777 */ 778 case TCP_KEEPCNT: 779 if (checkonly) 780 break; 781 782 if (*i1 == 0) { 783 return (EINVAL); 784 } else if (tcp->tcp_ka_rinterval == 0) { 785 /* 786 * When TCP_KEEPCNT is specified without first 787 * specifying a TCP_KEEPINTVL, we infer an 788 * interval based on a tunable specific to our 789 * stack: the tcp_keepalive_abort_interval. 790 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in 791 * the unlikely event that that has been set.) 792 * Given the abort interval's default value of 793 * 480 seconds, low TCP_KEEPCNT values can 794 * result in intervals that exceed the default 795 * maximum RTO of 60 seconds. Rather than 796 * fail in these cases, we (implicitly) clamp 797 * the interval at the maximum RTO; if the 798 * TCP_KEEPCNT is shortly followed by a 799 * TCP_KEEPINTVL (as we expect), the abort 800 * threshold will be recalculated correctly -- 801 * and if a TCP_KEEPINTVL is not forthcoming, 802 * keep-alive will at least operate reasonably 803 * given the underconfigured state. 804 */ 805 uint32_t interval; 806 807 interval = tcp->tcp_ka_abort_thres / *i1; 808 809 if (interval < tcp->tcp_rto_min) 810 interval = tcp->tcp_rto_min; 811 812 if (interval > tcp->tcp_rto_max) 813 interval = tcp->tcp_rto_max; 814 815 tcp->tcp_ka_rinterval = interval; 816 } else { 817 if ((*i1 * tcp->tcp_ka_rinterval) < 818 tcps->tcps_keepalive_abort_interval_low || 819 (*i1 * tcp->tcp_ka_rinterval) > 820 tcps->tcps_keepalive_abort_interval_high) 821 return (EINVAL); 822 tcp->tcp_ka_abort_thres = 823 (*i1 * tcp->tcp_ka_rinterval); 824 } 825 tcp->tcp_ka_cnt = *i1; 826 break; 827 case TCP_KEEPINTVL: 828 /* 829 * TCP_KEEPINTVL is specified in seconds, but 830 * tcp_ka_rinterval is in milliseconds. 831 */ 832 833 if (checkonly) 834 break; 835 836 if ((*i1 * 1000) < tcp->tcp_rto_min || 837 (*i1 * 1000) > tcp->tcp_rto_max) 838 return (EINVAL); 839 840 if (tcp->tcp_ka_cnt == 0) { 841 tcp->tcp_ka_cnt = 842 tcp->tcp_ka_abort_thres / (*i1 * 1000); 843 } else { 844 if ((*i1 * tcp->tcp_ka_cnt * 1000) < 845 tcps->tcps_keepalive_abort_interval_low || 846 (*i1 * tcp->tcp_ka_cnt * 1000) > 847 tcps->tcps_keepalive_abort_interval_high) 848 return (EINVAL); 849 tcp->tcp_ka_abort_thres = 850 (*i1 * tcp->tcp_ka_cnt * 1000); 851 } 852 tcp->tcp_ka_rinterval = *i1 * 1000; 853 break; 854 case TCP_KEEPALIVE_ABORT_THRESHOLD: 855 if (!checkonly) { 856 if (*i1 < 857 tcps->tcps_keepalive_abort_interval_low || 858 *i1 > 859 tcps->tcps_keepalive_abort_interval_high) { 860 *outlenp = 0; 861 return (EINVAL); 862 } 863 tcp->tcp_ka_abort_thres = *i1; 864 tcp->tcp_ka_cnt = 0; 865 tcp->tcp_ka_rinterval = 0; 866 } 867 break; 868 case TCP_CONGESTION: { 869 struct cc_algo *algo; 870 871 if (checkonly) { 872 break; 873 } 874 875 /* 876 * Make sure the string is NUL-terminated. Some 877 * consumers pass only the number of characters 878 * in the string, and don't include the NUL 879 * terminator, so we set it for them. 880 */ 881 if (inlen < CC_ALGO_NAME_MAX) { 882 invalp[inlen] = '\0'; 883 } 884 invalp[CC_ALGO_NAME_MAX - 1] = '\0'; 885 886 if ((algo = cc_load_algo((char *)invalp)) == NULL) { 887 return (ENOENT); 888 } 889 890 if (CC_ALGO(tcp)->cb_destroy != NULL) { 891 CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv); 892 } 893 894 CC_DATA(tcp) = NULL; 895 CC_ALGO(tcp) = algo; 896 897 if (CC_ALGO(tcp)->cb_init != NULL) { 898 VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv)); 899 } 900 901 break; 902 } 903 case TCP_CORK: 904 if (!checkonly) { 905 /* 906 * if tcp->tcp_cork was set and is now 907 * being unset, we have to make sure that 908 * the remaining data gets sent out. Also 909 * unset tcp->tcp_cork so that tcp_wput_data() 910 * can send data even if it is less than mss 911 */ 912 if (tcp->tcp_cork && onoff == 0 && 913 tcp->tcp_unsent > 0) { 914 tcp->tcp_cork = B_FALSE; 915 tcp_wput_data(tcp, NULL, B_FALSE); 916 } 917 tcp->tcp_cork = onoff; 918 } 919 break; 920 case TCP_RTO_INITIAL: 921 if (checkonly || val == 0) 922 break; 923 924 /* 925 * Sanity checks 926 * 927 * The initial RTO should be bounded by the minimum 928 * and maximum RTO. And it should also be smaller 929 * than the connect attempt abort timeout. Otherwise, 930 * the connection won't be aborted in a period 931 * reasonably close to that timeout. 932 */ 933 if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max || 934 val > tcp->tcp_second_ctimer_threshold || 935 val < tcps->tcps_rexmit_interval_initial_low || 936 val > tcps->tcps_rexmit_interval_initial_high) { 937 *outlenp = 0; 938 return (EINVAL); 939 } 940 tcp->tcp_rto_initial = val; 941 942 /* 943 * If TCP has not sent anything, need to re-calculate 944 * tcp_rto. Otherwise, this option change does not 945 * really affect anything. 946 */ 947 if (tcp->tcp_state >= TCPS_SYN_SENT) 948 break; 949 950 tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2; 951 tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1; 952 tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 953 tcps->tcps_conn_grace_period); 954 break; 955 case TCP_RTO_MIN: 956 if (checkonly || val == 0) 957 break; 958 959 if (val < tcps->tcps_rexmit_interval_min_low || 960 val > tcps->tcps_rexmit_interval_min_high || 961 val > tcp->tcp_rto_max) { 962 *outlenp = 0; 963 return (EINVAL); 964 } 965 tcp->tcp_rto_min = val; 966 if (tcp->tcp_rto < val) 967 tcp->tcp_rto = val; 968 break; 969 case TCP_RTO_MAX: 970 if (checkonly || val == 0) 971 break; 972 973 /* 974 * Sanity checks 975 * 976 * The maximum RTO should not be larger than the 977 * connection abort timeout. Otherwise, the 978 * connection won't be aborted in a period reasonably 979 * close to that timeout. 980 */ 981 if (val < tcps->tcps_rexmit_interval_max_low || 982 val > tcps->tcps_rexmit_interval_max_high || 983 val < tcp->tcp_rto_min || 984 val > tcp->tcp_second_timer_threshold) { 985 *outlenp = 0; 986 return (EINVAL); 987 } 988 tcp->tcp_rto_max = val; 989 if (tcp->tcp_rto > val) 990 tcp->tcp_rto = val; 991 break; 992 case TCP_LINGER2: 993 if (checkonly || *i1 == 0) 994 break; 995 996 /* 997 * Note that the option value's unit is second. And 998 * the value should be bigger than the private 999 * parameter tcp_fin_wait_2_flush_interval's lower 1000 * bound and smaller than the current value of that 1001 * parameter. It should be smaller than the current 1002 * value to avoid an app setting TCP_LINGER2 to a big 1003 * value, causing resource to be held up too long in 1004 * FIN-WAIT-2 state. 1005 */ 1006 if (*i1 < 0 || 1007 tcps->tcps_fin_wait_2_flush_interval_low/SECONDS > 1008 *i1 || 1009 tcps->tcps_fin_wait_2_flush_interval/SECONDS < 1010 *i1) { 1011 *outlenp = 0; 1012 return (EINVAL); 1013 } 1014 tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS; 1015 break; 1016 default: 1017 break; 1018 } 1019 break; 1020 case IPPROTO_IP: 1021 if (connp->conn_family != AF_INET) { 1022 *outlenp = 0; 1023 return (EINVAL); 1024 } 1025 switch (name) { 1026 case IP_SEC_OPT: 1027 /* 1028 * We should not allow policy setting after 1029 * we start listening for connections. 1030 */ 1031 if (tcp->tcp_state == TCPS_LISTEN) { 1032 return (EINVAL); 1033 } 1034 break; 1035 } 1036 break; 1037 case IPPROTO_IPV6: 1038 /* 1039 * IPPROTO_IPV6 options are only supported for sockets 1040 * that are using IPv6 on the wire. 1041 */ 1042 if (connp->conn_ipversion != IPV6_VERSION) { 1043 *outlenp = 0; 1044 return (EINVAL); 1045 } 1046 1047 switch (name) { 1048 case IPV6_RECVPKTINFO: 1049 if (!checkonly) { 1050 /* Force it to be sent up with the next msg */ 1051 tcp->tcp_recvifindex = 0; 1052 } 1053 break; 1054 case IPV6_RECVTCLASS: 1055 if (!checkonly) { 1056 /* Force it to be sent up with the next msg */ 1057 tcp->tcp_recvtclass = 0xffffffffU; 1058 } 1059 break; 1060 case IPV6_RECVHOPLIMIT: 1061 if (!checkonly) { 1062 /* Force it to be sent up with the next msg */ 1063 tcp->tcp_recvhops = 0xffffffffU; 1064 } 1065 break; 1066 case IPV6_PKTINFO: 1067 /* This is an extra check for TCP */ 1068 if (inlen == sizeof (struct in6_pktinfo)) { 1069 struct in6_pktinfo *pkti; 1070 1071 pkti = (struct in6_pktinfo *)invalp; 1072 /* 1073 * RFC 3542 states that ipi6_addr must be 1074 * the unspecified address when setting the 1075 * IPV6_PKTINFO sticky socket option on a 1076 * TCP socket. 1077 */ 1078 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 1079 return (EINVAL); 1080 } 1081 break; 1082 case IPV6_SEC_OPT: 1083 /* 1084 * We should not allow policy setting after 1085 * we start listening for connections. 1086 */ 1087 if (tcp->tcp_state == TCPS_LISTEN) { 1088 return (EINVAL); 1089 } 1090 break; 1091 } 1092 break; 1093 } 1094 reterr = conn_opt_set(&coas, level, name, inlen, invalp, 1095 checkonly, cr); 1096 if (reterr != 0) { 1097 *outlenp = 0; 1098 return (reterr); 1099 } 1100 1101 /* 1102 * Common case of OK return with outval same as inval 1103 */ 1104 if (invalp != outvalp) { 1105 /* don't trust bcopy for identical src/dst */ 1106 (void) bcopy(invalp, outvalp, inlen); 1107 } 1108 *outlenp = inlen; 1109 1110 if (coas.coa_changed & COA_HEADER_CHANGED) { 1111 /* If we are connected we rebuilt the headers */ 1112 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1113 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1114 reterr = tcp_build_hdrs(tcp); 1115 if (reterr != 0) 1116 return (reterr); 1117 } 1118 } 1119 if (coas.coa_changed & COA_ROUTE_CHANGED) { 1120 in6_addr_t nexthop; 1121 1122 /* 1123 * If we are connected we re-cache the information. 1124 * We ignore errors to preserve BSD behavior. 1125 * Note that we don't redo IPsec policy lookup here 1126 * since the final destination (or source) didn't change. 1127 */ 1128 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, 1129 &connp->conn_faddr_v6, &nexthop); 1130 1131 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1132 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1133 (void) ip_attr_connect(connp, connp->conn_ixa, 1134 &connp->conn_laddr_v6, &connp->conn_faddr_v6, 1135 &nexthop, connp->conn_fport, NULL, NULL, 1136 IPDF_VERIFY_DST); 1137 } 1138 } 1139 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { 1140 connp->conn_wq->q_hiwat = connp->conn_sndbuf; 1141 } 1142 if (coas.coa_changed & COA_WROFF_CHANGED) { 1143 connp->conn_wroff = connp->conn_ht_iphc_allocated + 1144 tcps->tcps_wroff_xtra; 1145 (void) proto_set_tx_wroff(connp->conn_rq, connp, 1146 connp->conn_wroff); 1147 } 1148 if (coas.coa_changed & COA_OOBINLINE_CHANGED) { 1149 if (IPCL_IS_NONSTR(connp)) 1150 proto_set_rx_oob_opt(connp, onoff); 1151 } 1152 return (0); 1153 } 1154