xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_opt_data.c (revision e52fb54bb8f22da555df8e240ebd249941b0ed95)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define	_SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
34 
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
37 #include <inet/ip.h>
38 
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
44 
45 static int	tcp_opt_default(queue_t *, int, int, uchar_t *);
46 
47 /*
48  * Table of all known options handled on a TCP protocol stack.
49  *
50  * Note: This table contains options processed by both TCP and IP levels
51  *       and is the superset of options that can be performed on a TCP over IP
52  *       stack.
53  */
54 opdes_t	tcp_opt_arr[] = {
55 
56 { SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
57 	sizeof (struct linger), 0 },
58 
59 { SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
60 { SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
61 { SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
62 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
63 	},
64 { SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
68 { SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
69 { SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
70 { SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
71 	sizeof (struct timeval), 0 },
72 { SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
73 	sizeof (struct timeval), 0 },
74 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
75 	},
76 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
77 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
78 	0 },
79 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
80 	0 },
81 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
82 	0 },
83 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
84 	0 },
85 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
86 
87 { SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
88 
89 { SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
90 
91 { TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
92 	},
93 { TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
94 	536 },
95 
96 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
97 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
98 
99 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
100 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
101 
102 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
103 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
104 
105 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
106 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
107 
108 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
109 	0 },
110 
111 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
112 	sizeof (int), 0 },
113 
114 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
115 	},
116 
117 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
118 	sizeof (int), 0 },
119 
120 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
121 	sizeof (int), 0	},
122 
123 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
124 
125 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
126 
127 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
128 
129 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
130 	sizeof (int), 0	},
131 
132 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
133 
134 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
135 
136 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
137 
138 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
139 
140 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
141 
142 { IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
143 	(OP_VARLEN|OP_NODEFAULT),
144 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
145 { T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
146 	(OP_VARLEN|OP_NODEFAULT),
147 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
148 
149 { IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
150 { T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
151 { IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
152 	sizeof (int), -1 /* not initialized */ },
153 
154 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
155 	sizeof (ipsec_req_t), -1 /* not initialized */ },
156 
157 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
158 	sizeof (int),	0 /* no ifindex */ },
159 
160 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
161 	sizeof (int), 0 },
162 
163 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
164 	sizeof (int), -1 /* not initialized */ },
165 
166 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
167 	sizeof (int),	0 /* no ifindex */ },
168 
169 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
170 
171 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
172 	sizeof (in_addr_t),	-1 /* not initialized  */ },
173 
174 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
175 	sizeof (int), 0 },
176 
177 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
178 	(OP_NODEFAULT|OP_VARLEN),
179 	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
180 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
181 	OP_NODEFAULT,
182 	sizeof (sin6_t), -1 /* not initialized */ },
183 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
184 	(OP_VARLEN|OP_NODEFAULT), 255*8,
185 	-1 /* not initialized */ },
186 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
187 	(OP_VARLEN|OP_NODEFAULT), 255*8,
188 	-1 /* not initialized */ },
189 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
190 	(OP_VARLEN|OP_NODEFAULT), 255*8,
191 	-1 /* not initialized */ },
192 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
193 	(OP_VARLEN|OP_NODEFAULT), 255*8,
194 	-1 /* not initialized */ },
195 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
196 	OP_NODEFAULT,
197 	sizeof (int), -1 /* not initialized */ },
198 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
199 	OP_NODEFAULT,
200 	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
201 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
202 	sizeof (int), 0 },
203 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
204 	sizeof (int), 0 },
205 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
206 	sizeof (int), 0 },
207 
208 /* Enable receipt of ancillary data */
209 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
210 	sizeof (int), 0 },
211 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
212 	sizeof (int), 0 },
213 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
214 	sizeof (int), 0 },
215 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
216 	sizeof (int), 0 },
217 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
218 	sizeof (int), 0 },
219 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
220 	sizeof (int), 0 },
221 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
222 	sizeof (int), 0 },
223 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
224 	sizeof (int), 0 },
225 
226 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
227 	sizeof (ipsec_req_t), -1 /* not initialized */ },
228 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
229 	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
230 };
231 
232 /*
233  * Table of all supported levels
234  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
235  * any supported options so we need this info separately.
236  *
237  * This is needed only for topmost tpi providers and is used only by
238  * XTI interfaces.
239  */
240 optlevel_t	tcp_valid_levels_arr[] = {
241 	XTI_GENERIC,
242 	SOL_SOCKET,
243 	IPPROTO_TCP,
244 	IPPROTO_IP,
245 	IPPROTO_IPV6
246 };
247 
248 
249 #define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
250 #define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
251 
252 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
253 
254 /*
255  * Initialize option database object for TCP
256  *
257  * This object represents database of options to search passed to
258  * {sock,tpi}optcom_req() interface routine to take care of option
259  * management and associated methods.
260  */
261 
262 optdb_obj_t tcp_opt_obj = {
263 	tcp_opt_default,	/* TCP default value function pointer */
264 	tcp_tpi_opt_get,	/* TCP get function pointer */
265 	tcp_tpi_opt_set,	/* TCP set function pointer */
266 	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
267 	tcp_opt_arr,		/* TCP option database */
268 	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
269 	tcp_valid_levels_arr	/* TCP valid level array */
270 };
271 
/*
 * Upper bound on the initial congestion window (start/restart) that the
 * TCP_INIT_CWND option may request.
 */
#define	TCP_MAX_INIT_CWND	16

static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
276 
277 /*
278  * Some TCP options can be "set" by requesting them in the option
279  * buffer. This is needed for XTI feature test though we do not
280  * allow it in general. We interpret that this mechanism is more
281  * applicable to OSI protocols and need not be allowed in general.
282  * This routine filters out options for which it is not allowed (most)
283  * and lets through those (few) for which it is. [ The XTI interface
284  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
285  * ever implemented will have to be allowed here ].
286  */
287 static boolean_t
288 tcp_allow_connopt_set(int level, int name)
289 {
290 
291 	switch (level) {
292 	case IPPROTO_TCP:
293 		switch (name) {
294 		case TCP_NODELAY:
295 			return (B_TRUE);
296 		default:
297 			return (B_FALSE);
298 		}
299 		/*NOTREACHED*/
300 	default:
301 		return (B_FALSE);
302 	}
303 	/*NOTREACHED*/
304 }
305 
306 /*
307  * This routine gets default values of certain options whose default
308  * values are maintained by protocol specific code
309  */
310 /* ARGSUSED */
311 static int
312 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
313 {
314 	int32_t	*i1 = (int32_t *)ptr;
315 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
316 
317 	switch (level) {
318 	case IPPROTO_TCP:
319 		switch (name) {
320 		case TCP_NOTIFY_THRESHOLD:
321 			*i1 = tcps->tcps_ip_notify_interval;
322 			break;
323 		case TCP_ABORT_THRESHOLD:
324 			*i1 = tcps->tcps_ip_abort_interval;
325 			break;
326 		case TCP_CONN_NOTIFY_THRESHOLD:
327 			*i1 = tcps->tcps_ip_notify_cinterval;
328 			break;
329 		case TCP_CONN_ABORT_THRESHOLD:
330 			*i1 = tcps->tcps_ip_abort_cinterval;
331 			break;
332 		default:
333 			return (-1);
334 		}
335 		break;
336 	case IPPROTO_IP:
337 		switch (name) {
338 		case IP_TTL:
339 			*i1 = tcps->tcps_ipv4_ttl;
340 			break;
341 		default:
342 			return (-1);
343 		}
344 		break;
345 	case IPPROTO_IPV6:
346 		switch (name) {
347 		case IPV6_UNICAST_HOPS:
348 			*i1 = tcps->tcps_ipv6_hoplimit;
349 			break;
350 		default:
351 			return (-1);
352 		}
353 		break;
354 	default:
355 		return (-1);
356 	}
357 	return (sizeof (int));
358 }
359 
360 /*
361  * TCP routine to get the values of options.
362  */
363 int
364 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
365 {
366 	int		*i1 = (int *)ptr;
367 	tcp_t		*tcp = connp->conn_tcp;
368 	conn_opt_arg_t	coas;
369 	int		retval;
370 
371 	coas.coa_connp = connp;
372 	coas.coa_ixa = connp->conn_ixa;
373 	coas.coa_ipp = &connp->conn_xmit_ipp;
374 	coas.coa_ancillary = B_FALSE;
375 	coas.coa_changed = 0;
376 
377 	switch (level) {
378 	case SOL_SOCKET:
379 		switch (name) {
380 		case SO_SND_COPYAVOID:
381 			*i1 = tcp->tcp_snd_zcopy_on ?
382 			    SO_SND_COPYAVOID : 0;
383 			return (sizeof (int));
384 		case SO_ACCEPTCONN:
385 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
386 			return (sizeof (int));
387 		}
388 		break;
389 	case IPPROTO_TCP:
390 		switch (name) {
391 		case TCP_NODELAY:
392 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
393 			return (sizeof (int));
394 		case TCP_MAXSEG:
395 			*i1 = tcp->tcp_mss;
396 			return (sizeof (int));
397 		case TCP_NOTIFY_THRESHOLD:
398 			*i1 = (int)tcp->tcp_first_timer_threshold;
399 			return (sizeof (int));
400 		case TCP_ABORT_THRESHOLD:
401 			*i1 = tcp->tcp_second_timer_threshold;
402 			return (sizeof (int));
403 		case TCP_CONN_NOTIFY_THRESHOLD:
404 			*i1 = tcp->tcp_first_ctimer_threshold;
405 			return (sizeof (int));
406 		case TCP_CONN_ABORT_THRESHOLD:
407 			*i1 = tcp->tcp_second_ctimer_threshold;
408 			return (sizeof (int));
409 		case TCP_INIT_CWND:
410 			*i1 = tcp->tcp_init_cwnd;
411 			return (sizeof (int));
412 		case TCP_KEEPALIVE_THRESHOLD:
413 			*i1 = tcp->tcp_ka_interval;
414 			return (sizeof (int));
415 
416 		/*
417 		 * TCP_KEEPIDLE expects value in seconds, but
418 		 * tcp_ka_interval is in milliseconds.
419 		 */
420 		case TCP_KEEPIDLE:
421 			*i1 = tcp->tcp_ka_interval / 1000;
422 			return (sizeof (int));
423 		case TCP_KEEPCNT:
424 			*i1 = tcp->tcp_ka_cnt;
425 			return (sizeof (int));
426 
427 		/*
428 		 * TCP_KEEPINTVL expects value in seconds, but
429 		 * tcp_ka_rinterval is in milliseconds.
430 		 */
431 		case TCP_KEEPINTVL:
432 			*i1 = tcp->tcp_ka_rinterval / 1000;
433 			return (sizeof (int));
434 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
435 			*i1 = tcp->tcp_ka_abort_thres;
436 			return (sizeof (int));
437 		case TCP_CORK:
438 			*i1 = tcp->tcp_cork;
439 			return (sizeof (int));
440 		case TCP_RTO_INITIAL:
441 			*i1 = tcp->tcp_rto_initial;
442 			return (sizeof (uint32_t));
443 		case TCP_RTO_MIN:
444 			*i1 = tcp->tcp_rto_min;
445 			return (sizeof (uint32_t));
446 		case TCP_RTO_MAX:
447 			*i1 = tcp->tcp_rto_max;
448 			return (sizeof (uint32_t));
449 		case TCP_LINGER2:
450 			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
451 			return (sizeof (int));
452 		}
453 		break;
454 	case IPPROTO_IP:
455 		if (connp->conn_family != AF_INET)
456 			return (-1);
457 		switch (name) {
458 		case IP_OPTIONS:
459 		case T_IP_OPTIONS:
460 			/* Caller ensures enough space */
461 			return (ip_opt_get_user(connp, ptr));
462 		default:
463 			break;
464 		}
465 		break;
466 
467 	case IPPROTO_IPV6:
468 		/*
469 		 * IPPROTO_IPV6 options are only supported for sockets
470 		 * that are using IPv6 on the wire.
471 		 */
472 		if (connp->conn_ipversion != IPV6_VERSION) {
473 			return (-1);
474 		}
475 		switch (name) {
476 		case IPV6_PATHMTU:
477 			if (tcp->tcp_state < TCPS_ESTABLISHED)
478 				return (-1);
479 			break;
480 		}
481 		break;
482 	}
483 	mutex_enter(&connp->conn_lock);
484 	retval = conn_opt_get(&coas, level, name, ptr);
485 	mutex_exit(&connp->conn_lock);
486 	return (retval);
487 }
488 
489 /*
490  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
491  * Parameters are assumed to be verified by the caller.
492  */
493 /* ARGSUSED */
494 int
495 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
496     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
497     void *thisdg_attrs, cred_t *cr)
498 {
499 	tcp_t	*tcp = connp->conn_tcp;
500 	int	*i1 = (int *)invalp;
501 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
502 	boolean_t checkonly;
503 	int	reterr;
504 	tcp_stack_t	*tcps = tcp->tcp_tcps;
505 	conn_opt_arg_t	coas;
506 	uint32_t	val = *((uint32_t *)invalp);
507 
508 	coas.coa_connp = connp;
509 	coas.coa_ixa = connp->conn_ixa;
510 	coas.coa_ipp = &connp->conn_xmit_ipp;
511 	coas.coa_ancillary = B_FALSE;
512 	coas.coa_changed = 0;
513 
514 	switch (optset_context) {
515 	case SETFN_OPTCOM_CHECKONLY:
516 		checkonly = B_TRUE;
517 		/*
518 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
519 		 * inlen != 0 implies value supplied and
520 		 * 	we have to "pretend" to set it.
521 		 * inlen == 0 implies that there is no
522 		 * 	value part in T_CHECK request and just validation
523 		 * done elsewhere should be enough, we just return here.
524 		 */
525 		if (inlen == 0) {
526 			*outlenp = 0;
527 			return (0);
528 		}
529 		break;
530 	case SETFN_OPTCOM_NEGOTIATE:
531 		checkonly = B_FALSE;
532 		break;
533 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
534 	case SETFN_CONN_NEGOTIATE:
535 		checkonly = B_FALSE;
536 		/*
537 		 * Negotiating local and "association-related" options
538 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
539 		 * primitives is allowed by XTI, but we choose
540 		 * to not implement this style negotiation for Internet
541 		 * protocols (We interpret it is a must for OSI world but
542 		 * optional for Internet protocols) for all options.
543 		 * [ Will do only for the few options that enable test
544 		 * suites that our XTI implementation of this feature
545 		 * works for transports that do allow it ]
546 		 */
547 		if (!tcp_allow_connopt_set(level, name)) {
548 			*outlenp = 0;
549 			return (EINVAL);
550 		}
551 		break;
552 	default:
553 		/*
554 		 * We should never get here
555 		 */
556 		*outlenp = 0;
557 		return (EINVAL);
558 	}
559 
560 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
561 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
562 
563 	/*
564 	 * For TCP, we should have no ancillary data sent down
565 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
566 	 * has to be zero.
567 	 */
568 	ASSERT(thisdg_attrs == NULL);
569 
570 	/*
571 	 * For fixed length options, no sanity check
572 	 * of passed in length is done. It is assumed *_optcom_req()
573 	 * routines do the right thing.
574 	 */
575 	switch (level) {
576 	case SOL_SOCKET:
577 		switch (name) {
578 		case SO_KEEPALIVE:
579 			if (checkonly) {
580 				/* check only case */
581 				break;
582 			}
583 
584 			if (!onoff) {
585 				if (connp->conn_keepalive) {
586 					if (tcp->tcp_ka_tid != 0) {
587 						(void) TCP_TIMER_CANCEL(tcp,
588 						    tcp->tcp_ka_tid);
589 						tcp->tcp_ka_tid = 0;
590 					}
591 					connp->conn_keepalive = 0;
592 				}
593 				break;
594 			}
595 			if (!connp->conn_keepalive) {
596 				/* Crank up the keepalive timer */
597 				tcp->tcp_ka_last_intrvl = 0;
598 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
599 				    tcp_keepalive_timer, tcp->tcp_ka_interval);
600 				connp->conn_keepalive = 1;
601 			}
602 			break;
603 		case SO_SNDBUF: {
604 			if (*i1 > tcps->tcps_max_buf) {
605 				*outlenp = 0;
606 				return (ENOBUFS);
607 			}
608 			if (checkonly)
609 				break;
610 
611 			connp->conn_sndbuf = *i1;
612 			if (tcps->tcps_snd_lowat_fraction != 0) {
613 				connp->conn_sndlowat = connp->conn_sndbuf /
614 				    tcps->tcps_snd_lowat_fraction;
615 			}
616 			(void) tcp_maxpsz_set(tcp, B_TRUE);
617 			/*
618 			 * If we are flow-controlled, recheck the condition.
619 			 * There are apps that increase SO_SNDBUF size when
620 			 * flow-controlled (EWOULDBLOCK), and expect the flow
621 			 * control condition to be lifted right away.
622 			 */
623 			mutex_enter(&tcp->tcp_non_sq_lock);
624 			if (tcp->tcp_flow_stopped &&
625 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
626 				tcp_clrqfull(tcp);
627 			}
628 			mutex_exit(&tcp->tcp_non_sq_lock);
629 			*outlenp = inlen;
630 			return (0);
631 		}
632 		case SO_RCVBUF:
633 			if (*i1 > tcps->tcps_max_buf) {
634 				*outlenp = 0;
635 				return (ENOBUFS);
636 			}
637 			/* Silently ignore zero */
638 			if (!checkonly && *i1 != 0) {
639 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
640 				(void) tcp_rwnd_set(tcp, *i1);
641 			}
642 			/*
643 			 * XXX should we return the rwnd here
644 			 * and tcp_opt_get ?
645 			 */
646 			*outlenp = inlen;
647 			return (0);
648 		case SO_SND_COPYAVOID:
649 			if (!checkonly) {
650 				if (tcp->tcp_loopback ||
651 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
652 					*outlenp = 0;
653 					return (EOPNOTSUPP);
654 				}
655 				tcp->tcp_snd_zcopy_aware = 1;
656 			}
657 			*outlenp = inlen;
658 			return (0);
659 		}
660 		break;
661 	case IPPROTO_TCP:
662 		switch (name) {
663 		case TCP_NODELAY:
664 			if (!checkonly)
665 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
666 			break;
667 		case TCP_NOTIFY_THRESHOLD:
668 			if (!checkonly)
669 				tcp->tcp_first_timer_threshold = *i1;
670 			break;
671 		case TCP_ABORT_THRESHOLD:
672 			if (!checkonly)
673 				tcp->tcp_second_timer_threshold = *i1;
674 			break;
675 		case TCP_CONN_NOTIFY_THRESHOLD:
676 			if (!checkonly)
677 				tcp->tcp_first_ctimer_threshold = *i1;
678 			break;
679 		case TCP_CONN_ABORT_THRESHOLD:
680 			if (!checkonly)
681 				tcp->tcp_second_ctimer_threshold = *i1;
682 			break;
683 		case TCP_RECVDSTADDR:
684 			if (tcp->tcp_state > TCPS_LISTEN) {
685 				*outlenp = 0;
686 				return (EOPNOTSUPP);
687 			}
688 			/* Setting done in conn_opt_set */
689 			break;
690 		case TCP_INIT_CWND:
691 			if (checkonly)
692 				break;
693 
694 			/*
695 			 * Only allow socket with network configuration
696 			 * privilege to set the initial cwnd to be larger
697 			 * than allowed by RFC 3390.
698 			 */
699 			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
700 				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
701 				    != 0) {
702 					*outlenp = 0;
703 					return (reterr);
704 				}
705 				if (val > tcp_max_init_cwnd) {
706 					*outlenp = 0;
707 					return (EINVAL);
708 				}
709 			}
710 
711 			tcp->tcp_init_cwnd = val;
712 
713 			/*
714 			 * If the socket is connected, AND no outbound data
715 			 * has been sent, reset the actual cwnd values.
716 			 */
717 			if (tcp->tcp_state == TCPS_ESTABLISHED &&
718 			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
719 				tcp->tcp_cwnd =
720 				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
721 			}
722 			break;
723 
724 		/*
725 		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
726 		 * is in milliseconds. TCP_KEEPIDLE is introduced for
727 		 * compatibility with other Unix flavors.
728 		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
729 		 * converting the input to milliseconds.
730 		 */
731 		case TCP_KEEPIDLE:
732 			*i1 *= 1000;
733 			/* FALLTHRU */
734 
735 		case TCP_KEEPALIVE_THRESHOLD:
736 			if (checkonly)
737 				break;
738 
739 			if (*i1 < tcps->tcps_keepalive_interval_low ||
740 			    *i1 > tcps->tcps_keepalive_interval_high) {
741 				*outlenp = 0;
742 				return (EINVAL);
743 			}
744 			if (*i1 != tcp->tcp_ka_interval) {
745 				tcp->tcp_ka_interval = *i1;
746 				/*
747 				 * Check if we need to restart the
748 				 * keepalive timer.
749 				 */
750 				if (tcp->tcp_ka_tid != 0) {
751 					ASSERT(connp->conn_keepalive);
752 					(void) TCP_TIMER_CANCEL(tcp,
753 					    tcp->tcp_ka_tid);
754 					tcp->tcp_ka_last_intrvl = 0;
755 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
756 					    tcp_keepalive_timer,
757 					    tcp->tcp_ka_interval);
758 				}
759 			}
760 			break;
761 
762 		/*
763 		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
764 		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
765 		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
766 		 * tcp_ka_cnt.
767 		 */
768 		case TCP_KEEPCNT:
769 			if (checkonly)
770 				break;
771 
772 			if (*i1 == 0) {
773 				return (EINVAL);
774 			} else if (tcp->tcp_ka_rinterval == 0) {
775 				if ((tcp->tcp_ka_abort_thres / *i1) <
776 				    tcp->tcp_rto_min ||
777 				    (tcp->tcp_ka_abort_thres / *i1) >
778 				    tcp->tcp_rto_max)
779 					return (EINVAL);
780 
781 				tcp->tcp_ka_rinterval =
782 				    tcp->tcp_ka_abort_thres / *i1;
783 			} else {
784 				if ((*i1 * tcp->tcp_ka_rinterval) <
785 				    tcps->tcps_keepalive_abort_interval_low ||
786 				    (*i1 * tcp->tcp_ka_rinterval) >
787 				    tcps->tcps_keepalive_abort_interval_high)
788 					return (EINVAL);
789 				tcp->tcp_ka_abort_thres =
790 				    (*i1 * tcp->tcp_ka_rinterval);
791 			}
792 			tcp->tcp_ka_cnt = *i1;
793 			break;
794 		case TCP_KEEPINTVL:
795 			/*
796 			 * TCP_KEEPINTVL is specified in seconds, but
797 			 * tcp_ka_rinterval is in milliseconds.
798 			 */
799 
800 			if (checkonly)
801 				break;
802 
803 			if ((*i1 * 1000) < tcp->tcp_rto_min ||
804 			    (*i1 * 1000) > tcp->tcp_rto_max)
805 				return (EINVAL);
806 
807 			if (tcp->tcp_ka_cnt == 0) {
808 				tcp->tcp_ka_cnt =
809 				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
810 			} else {
811 				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
812 				    tcps->tcps_keepalive_abort_interval_low ||
813 				    (*i1 * tcp->tcp_ka_cnt * 1000) >
814 				    tcps->tcps_keepalive_abort_interval_high)
815 					return (EINVAL);
816 				tcp->tcp_ka_abort_thres =
817 				    (*i1 * tcp->tcp_ka_cnt * 1000);
818 			}
819 			tcp->tcp_ka_rinterval = *i1 * 1000;
820 			break;
821 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
822 			if (!checkonly) {
823 				if (*i1 <
824 				    tcps->tcps_keepalive_abort_interval_low ||
825 				    *i1 >
826 				    tcps->tcps_keepalive_abort_interval_high) {
827 					*outlenp = 0;
828 					return (EINVAL);
829 				}
830 				tcp->tcp_ka_abort_thres = *i1;
831 				tcp->tcp_ka_cnt = 0;
832 				tcp->tcp_ka_rinterval = 0;
833 			}
834 			break;
835 		case TCP_CORK:
836 			if (!checkonly) {
837 				/*
838 				 * if tcp->tcp_cork was set and is now
839 				 * being unset, we have to make sure that
840 				 * the remaining data gets sent out. Also
841 				 * unset tcp->tcp_cork so that tcp_wput_data()
842 				 * can send data even if it is less than mss
843 				 */
844 				if (tcp->tcp_cork && onoff == 0 &&
845 				    tcp->tcp_unsent > 0) {
846 					tcp->tcp_cork = B_FALSE;
847 					tcp_wput_data(tcp, NULL, B_FALSE);
848 				}
849 				tcp->tcp_cork = onoff;
850 			}
851 			break;
852 		case TCP_RTO_INITIAL: {
853 			clock_t rto;
854 
855 			if (checkonly || val == 0)
856 				break;
857 
858 			/*
859 			 * Sanity checks
860 			 *
861 			 * The initial RTO should be bounded by the minimum
862 			 * and maximum RTO.  And it should also be smaller
863 			 * than the connect attempt abort timeout.  Otherwise,
864 			 * the connection won't be aborted in a period
865 			 * reasonably close to that timeout.
866 			 */
867 			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
868 			    val > tcp->tcp_second_ctimer_threshold ||
869 			    val < tcps->tcps_rexmit_interval_initial_low ||
870 			    val > tcps->tcps_rexmit_interval_initial_high) {
871 				*outlenp = 0;
872 				return (EINVAL);
873 			}
874 			tcp->tcp_rto_initial = val;
875 
876 			/*
877 			 * If TCP has not sent anything, need to re-calculate
878 			 * tcp_rto.  Otherwise, this option change does not
879 			 * really affect anything.
880 			 */
881 			if (tcp->tcp_state >= TCPS_SYN_SENT)
882 				break;
883 
884 			tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
885 			tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
886 			rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
887 			    tcps->tcps_rexmit_interval_extra +
888 			    (tcp->tcp_rtt_sa >> 5) +
889 			    tcps->tcps_conn_grace_period;
890 			TCP_SET_RTO(tcp, rto);
891 			break;
892 		}
893 		case TCP_RTO_MIN:
894 			if (checkonly || val == 0)
895 				break;
896 
897 			if (val < tcps->tcps_rexmit_interval_min_low ||
898 			    val > tcps->tcps_rexmit_interval_min_high ||
899 			    val > tcp->tcp_rto_max) {
900 				*outlenp = 0;
901 				return (EINVAL);
902 			}
903 			tcp->tcp_rto_min = val;
904 			if (tcp->tcp_rto < val)
905 				tcp->tcp_rto = val;
906 			break;
907 		case TCP_RTO_MAX:
908 			if (checkonly || val == 0)
909 				break;
910 
911 			/*
912 			 * Sanity checks
913 			 *
914 			 * The maximum RTO should not be larger than the
915 			 * connection abort timeout.  Otherwise, the
916 			 * connection won't be aborted in a period reasonably
917 			 * close to that timeout.
918 			 */
919 			if (val < tcps->tcps_rexmit_interval_max_low ||
920 			    val > tcps->tcps_rexmit_interval_max_high ||
921 			    val < tcp->tcp_rto_min ||
922 			    val > tcp->tcp_second_timer_threshold) {
923 				*outlenp = 0;
924 				return (EINVAL);
925 			}
926 			tcp->tcp_rto_max = val;
927 			if (tcp->tcp_rto > val)
928 				tcp->tcp_rto = val;
929 			break;
930 		case TCP_LINGER2:
931 			if (checkonly || *i1 == 0)
932 				break;
933 
934 			/*
935 			 * Note that the option value's unit is second.  And
936 			 * the value should be bigger than the private
937 			 * parameter tcp_fin_wait_2_flush_interval's lower
938 			 * bound and smaller than the current value of that
939 			 * parameter.  It should be smaller than the current
940 			 * value to avoid an app setting TCP_LINGER2 to a big
941 			 * value, causing resource to be held up too long in
942 			 * FIN-WAIT-2 state.
943 			 */
944 			if (*i1 < 0 ||
945 			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
946 			    *i1 ||
947 			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
948 			    *i1) {
949 				*outlenp = 0;
950 				return (EINVAL);
951 			}
952 			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
953 			break;
954 		default:
955 			break;
956 		}
957 		break;
958 	case IPPROTO_IP:
959 		if (connp->conn_family != AF_INET) {
960 			*outlenp = 0;
961 			return (EINVAL);
962 		}
963 		switch (name) {
964 		case IP_SEC_OPT:
965 			/*
966 			 * We should not allow policy setting after
967 			 * we start listening for connections.
968 			 */
969 			if (tcp->tcp_state == TCPS_LISTEN) {
970 				return (EINVAL);
971 			}
972 			break;
973 		}
974 		break;
975 	case IPPROTO_IPV6:
976 		/*
977 		 * IPPROTO_IPV6 options are only supported for sockets
978 		 * that are using IPv6 on the wire.
979 		 */
980 		if (connp->conn_ipversion != IPV6_VERSION) {
981 			*outlenp = 0;
982 			return (EINVAL);
983 		}
984 
985 		switch (name) {
986 		case IPV6_RECVPKTINFO:
987 			if (!checkonly) {
988 				/* Force it to be sent up with the next msg */
989 				tcp->tcp_recvifindex = 0;
990 			}
991 			break;
992 		case IPV6_RECVTCLASS:
993 			if (!checkonly) {
994 				/* Force it to be sent up with the next msg */
995 				tcp->tcp_recvtclass = 0xffffffffU;
996 			}
997 			break;
998 		case IPV6_RECVHOPLIMIT:
999 			if (!checkonly) {
1000 				/* Force it to be sent up with the next msg */
1001 				tcp->tcp_recvhops = 0xffffffffU;
1002 			}
1003 			break;
1004 		case IPV6_PKTINFO:
1005 			/* This is an extra check for TCP */
1006 			if (inlen == sizeof (struct in6_pktinfo)) {
1007 				struct in6_pktinfo *pkti;
1008 
1009 				pkti = (struct in6_pktinfo *)invalp;
1010 				/*
1011 				 * RFC 3542 states that ipi6_addr must be
1012 				 * the unspecified address when setting the
1013 				 * IPV6_PKTINFO sticky socket option on a
1014 				 * TCP socket.
1015 				 */
1016 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1017 					return (EINVAL);
1018 			}
1019 			break;
1020 		case IPV6_SEC_OPT:
1021 			/*
1022 			 * We should not allow policy setting after
1023 			 * we start listening for connections.
1024 			 */
1025 			if (tcp->tcp_state == TCPS_LISTEN) {
1026 				return (EINVAL);
1027 			}
1028 			break;
1029 		}
1030 		break;
1031 	}
1032 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1033 	    checkonly, cr);
1034 	if (reterr != 0) {
1035 		*outlenp = 0;
1036 		return (reterr);
1037 	}
1038 
1039 	/*
1040 	 * Common case of OK return with outval same as inval
1041 	 */
1042 	if (invalp != outvalp) {
1043 		/* don't trust bcopy for identical src/dst */
1044 		(void) bcopy(invalp, outvalp, inlen);
1045 	}
1046 	*outlenp = inlen;
1047 
1048 	if (coas.coa_changed & COA_HEADER_CHANGED) {
1049 		/* If we are connected we rebuilt the headers */
1050 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1051 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1052 			reterr = tcp_build_hdrs(tcp);
1053 			if (reterr != 0)
1054 				return (reterr);
1055 		}
1056 	}
1057 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
1058 		in6_addr_t nexthop;
1059 
1060 		/*
1061 		 * If we are connected we re-cache the information.
1062 		 * We ignore errors to preserve BSD behavior.
1063 		 * Note that we don't redo IPsec policy lookup here
1064 		 * since the final destination (or source) didn't change.
1065 		 */
1066 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1067 		    &connp->conn_faddr_v6, &nexthop);
1068 
1069 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1070 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1071 			(void) ip_attr_connect(connp, connp->conn_ixa,
1072 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1073 			    &nexthop, connp->conn_fport, NULL, NULL,
1074 			    IPDF_VERIFY_DST);
1075 		}
1076 	}
1077 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1078 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1079 	}
1080 	if (coas.coa_changed & COA_WROFF_CHANGED) {
1081 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
1082 		    tcps->tcps_wroff_xtra;
1083 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
1084 		    connp->conn_wroff);
1085 	}
1086 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1087 		if (IPCL_IS_NONSTR(connp))
1088 			proto_set_rx_oob_opt(connp, onoff);
1089 	}
1090 	return (0);
1091 }
1092