xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_opt_data.c (revision 8b713775314bbbf24edd503b4869342d8711ce95)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define	_SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
34 
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
37 #include <inet/ip.h>
38 
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
44 
45 static int	tcp_opt_default(queue_t *, int, int, uchar_t *);
46 
47 /*
48  * Table of all known options handled on a TCP protocol stack.
49  *
50  * Note: This table contains options processed by both TCP and IP levels
51  *       and is the superset of options that can be performed on a TCP over IP
52  *       stack.
53  */
54 opdes_t	tcp_opt_arr[] = {
55 
56 { SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
57 	sizeof (struct linger), 0 },
58 
59 { SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
60 { SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
61 { SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
62 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
63 	},
64 { SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
68 { SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
69 { SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
70 { SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
71 	sizeof (struct timeval), 0 },
72 { SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
73 	sizeof (struct timeval), 0 },
74 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
75 	},
76 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
77 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
78 	0 },
79 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
80 	0 },
81 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
82 	0 },
83 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
84 	0 },
85 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
86 
87 { SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
88 
89 { SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
90 
91 { TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
92 	},
93 { TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
94 	536 },
95 
96 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
97 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
98 
99 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
100 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
101 
102 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
103 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
104 
105 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
106 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
107 
108 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
109 	0 },
110 
111 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
112 	sizeof (int), 0 },
113 
114 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
115 	},
116 
117 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
118 	sizeof (int), 0 },
119 
120 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
121 	sizeof (int), 0	},
122 
123 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
124 
125 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
126 
127 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
128 
129 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
130 	sizeof (int), 0	},
131 
132 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
133 
134 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
135 
136 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
137 
138 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
139 
140 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
141 
142 { IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
143 	(OP_VARLEN|OP_NODEFAULT),
144 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
145 { T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
146 	(OP_VARLEN|OP_NODEFAULT),
147 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
148 
149 { IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
150 { T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
151 { IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
152 	sizeof (int), -1 /* not initialized */ },
153 
154 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
155 	sizeof (ipsec_req_t), -1 /* not initialized */ },
156 
157 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
158 	sizeof (int),	0 /* no ifindex */ },
159 
160 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
161 	sizeof (int), 0 },
162 
163 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
164 	sizeof (int), -1 /* not initialized */ },
165 
166 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
167 	sizeof (int),	0 /* no ifindex */ },
168 
169 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
170 
171 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
172 	sizeof (in_addr_t),	-1 /* not initialized  */ },
173 
174 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
175 	sizeof (int), 0 },
176 
177 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
178 	(OP_NODEFAULT|OP_VARLEN),
179 	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
180 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
181 	OP_NODEFAULT,
182 	sizeof (sin6_t), -1 /* not initialized */ },
183 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
184 	(OP_VARLEN|OP_NODEFAULT), 255*8,
185 	-1 /* not initialized */ },
186 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
187 	(OP_VARLEN|OP_NODEFAULT), 255*8,
188 	-1 /* not initialized */ },
189 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
190 	(OP_VARLEN|OP_NODEFAULT), 255*8,
191 	-1 /* not initialized */ },
192 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
193 	(OP_VARLEN|OP_NODEFAULT), 255*8,
194 	-1 /* not initialized */ },
195 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
196 	OP_NODEFAULT,
197 	sizeof (int), -1 /* not initialized */ },
198 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
199 	OP_NODEFAULT,
200 	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
201 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
202 	sizeof (int), 0 },
203 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
204 	sizeof (int), 0 },
205 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
206 	sizeof (int), 0 },
207 
208 /* Enable receipt of ancillary data */
209 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
210 	sizeof (int), 0 },
211 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
212 	sizeof (int), 0 },
213 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
214 	sizeof (int), 0 },
215 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
216 	sizeof (int), 0 },
217 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
218 	sizeof (int), 0 },
219 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
220 	sizeof (int), 0 },
221 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
222 	sizeof (int), 0 },
223 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
224 	sizeof (int), 0 },
225 
226 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
227 	sizeof (ipsec_req_t), -1 /* not initialized */ },
228 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
229 	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
230 };
231 
232 /*
233  * Table of all supported levels
234  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
235  * any supported options so we need this info separately.
236  *
237  * This is needed only for topmost tpi providers and is used only by
238  * XTI interfaces.
239  */
240 optlevel_t	tcp_valid_levels_arr[] = {
241 	XTI_GENERIC,
242 	SOL_SOCKET,
243 	IPPROTO_TCP,
244 	IPPROTO_IP,
245 	IPPROTO_IPV6
246 };
247 
248 
249 #define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
250 #define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
251 
252 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
253 
254 /*
255  * Initialize option database object for TCP
256  *
257  * This object represents database of options to search passed to
258  * {sock,tpi}optcom_req() interface routine to take care of option
259  * management and associated methods.
260  */
261 
262 optdb_obj_t tcp_opt_obj = {
263 	tcp_opt_default,	/* TCP default value function pointer */
264 	tcp_tpi_opt_get,	/* TCP get function pointer */
265 	tcp_tpi_opt_set,	/* TCP set function pointer */
266 	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
267 	tcp_opt_arr,		/* TCP option database */
268 	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
269 	tcp_valid_levels_arr	/* TCP valid level array */
270 };
271 
272 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
273 
274 /*
275  * Some TCP options can be "set" by requesting them in the option
276  * buffer. This is needed for XTI feature test though we do not
277  * allow it in general. We interpret that this mechanism is more
278  * applicable to OSI protocols and need not be allowed in general.
279  * This routine filters out options for which it is not allowed (most)
280  * and lets through those (few) for which it is. [ The XTI interface
281  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
282  * ever implemented will have to be allowed here ].
283  */
284 static boolean_t
285 tcp_allow_connopt_set(int level, int name)
286 {
287 
288 	switch (level) {
289 	case IPPROTO_TCP:
290 		switch (name) {
291 		case TCP_NODELAY:
292 			return (B_TRUE);
293 		default:
294 			return (B_FALSE);
295 		}
296 		/*NOTREACHED*/
297 	default:
298 		return (B_FALSE);
299 	}
300 	/*NOTREACHED*/
301 }
302 
303 /*
304  * This routine gets default values of certain options whose default
305  * values are maintained by protocol specific code
306  */
307 /* ARGSUSED */
308 static int
309 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
310 {
311 	int32_t	*i1 = (int32_t *)ptr;
312 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
313 
314 	switch (level) {
315 	case IPPROTO_TCP:
316 		switch (name) {
317 		case TCP_NOTIFY_THRESHOLD:
318 			*i1 = tcps->tcps_ip_notify_interval;
319 			break;
320 		case TCP_ABORT_THRESHOLD:
321 			*i1 = tcps->tcps_ip_abort_interval;
322 			break;
323 		case TCP_CONN_NOTIFY_THRESHOLD:
324 			*i1 = tcps->tcps_ip_notify_cinterval;
325 			break;
326 		case TCP_CONN_ABORT_THRESHOLD:
327 			*i1 = tcps->tcps_ip_abort_cinterval;
328 			break;
329 		default:
330 			return (-1);
331 		}
332 		break;
333 	case IPPROTO_IP:
334 		switch (name) {
335 		case IP_TTL:
336 			*i1 = tcps->tcps_ipv4_ttl;
337 			break;
338 		default:
339 			return (-1);
340 		}
341 		break;
342 	case IPPROTO_IPV6:
343 		switch (name) {
344 		case IPV6_UNICAST_HOPS:
345 			*i1 = tcps->tcps_ipv6_hoplimit;
346 			break;
347 		default:
348 			return (-1);
349 		}
350 		break;
351 	default:
352 		return (-1);
353 	}
354 	return (sizeof (int));
355 }
356 
357 /*
358  * TCP routine to get the values of options.
359  */
360 int
361 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
362 {
363 	int		*i1 = (int *)ptr;
364 	tcp_t		*tcp = connp->conn_tcp;
365 	conn_opt_arg_t	coas;
366 	int		retval;
367 
368 	coas.coa_connp = connp;
369 	coas.coa_ixa = connp->conn_ixa;
370 	coas.coa_ipp = &connp->conn_xmit_ipp;
371 	coas.coa_ancillary = B_FALSE;
372 	coas.coa_changed = 0;
373 
374 	switch (level) {
375 	case SOL_SOCKET:
376 		switch (name) {
377 		case SO_SND_COPYAVOID:
378 			*i1 = tcp->tcp_snd_zcopy_on ?
379 			    SO_SND_COPYAVOID : 0;
380 			return (sizeof (int));
381 		case SO_ACCEPTCONN:
382 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
383 			return (sizeof (int));
384 		}
385 		break;
386 	case IPPROTO_TCP:
387 		switch (name) {
388 		case TCP_NODELAY:
389 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
390 			return (sizeof (int));
391 		case TCP_MAXSEG:
392 			*i1 = tcp->tcp_mss;
393 			return (sizeof (int));
394 		case TCP_NOTIFY_THRESHOLD:
395 			*i1 = (int)tcp->tcp_first_timer_threshold;
396 			return (sizeof (int));
397 		case TCP_ABORT_THRESHOLD:
398 			*i1 = tcp->tcp_second_timer_threshold;
399 			return (sizeof (int));
400 		case TCP_CONN_NOTIFY_THRESHOLD:
401 			*i1 = tcp->tcp_first_ctimer_threshold;
402 			return (sizeof (int));
403 		case TCP_CONN_ABORT_THRESHOLD:
404 			*i1 = tcp->tcp_second_ctimer_threshold;
405 			return (sizeof (int));
406 		case TCP_INIT_CWND:
407 			*i1 = tcp->tcp_init_cwnd;
408 			return (sizeof (int));
409 		case TCP_KEEPALIVE_THRESHOLD:
410 			*i1 = tcp->tcp_ka_interval;
411 			return (sizeof (int));
412 
413 		/*
414 		 * TCP_KEEPIDLE expects value in seconds, but
415 		 * tcp_ka_interval is in milliseconds.
416 		 */
417 		case TCP_KEEPIDLE:
418 			*i1 = tcp->tcp_ka_interval / 1000;
419 			return (sizeof (int));
420 		case TCP_KEEPCNT:
421 			*i1 = tcp->tcp_ka_cnt;
422 			return (sizeof (int));
423 
424 		/*
425 		 * TCP_KEEPINTVL expects value in seconds, but
426 		 * tcp_ka_rinterval is in milliseconds.
427 		 */
428 		case TCP_KEEPINTVL:
429 			*i1 = tcp->tcp_ka_rinterval / 1000;
430 			return (sizeof (int));
431 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
432 			*i1 = tcp->tcp_ka_abort_thres;
433 			return (sizeof (int));
434 		case TCP_CORK:
435 			*i1 = tcp->tcp_cork;
436 			return (sizeof (int));
437 		case TCP_RTO_INITIAL:
438 			*i1 = tcp->tcp_rto_initial;
439 			return (sizeof (uint32_t));
440 		case TCP_RTO_MIN:
441 			*i1 = tcp->tcp_rto_min;
442 			return (sizeof (uint32_t));
443 		case TCP_RTO_MAX:
444 			*i1 = tcp->tcp_rto_max;
445 			return (sizeof (uint32_t));
446 		case TCP_LINGER2:
447 			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
448 			return (sizeof (int));
449 		}
450 		break;
451 	case IPPROTO_IP:
452 		if (connp->conn_family != AF_INET)
453 			return (-1);
454 		switch (name) {
455 		case IP_OPTIONS:
456 		case T_IP_OPTIONS:
457 			/* Caller ensures enough space */
458 			return (ip_opt_get_user(connp, ptr));
459 		default:
460 			break;
461 		}
462 		break;
463 
464 	case IPPROTO_IPV6:
465 		/*
466 		 * IPPROTO_IPV6 options are only supported for sockets
467 		 * that are using IPv6 on the wire.
468 		 */
469 		if (connp->conn_ipversion != IPV6_VERSION) {
470 			return (-1);
471 		}
472 		switch (name) {
473 		case IPV6_PATHMTU:
474 			if (tcp->tcp_state < TCPS_ESTABLISHED)
475 				return (-1);
476 			break;
477 		}
478 		break;
479 	}
480 	mutex_enter(&connp->conn_lock);
481 	retval = conn_opt_get(&coas, level, name, ptr);
482 	mutex_exit(&connp->conn_lock);
483 	return (retval);
484 }
485 
486 /*
487  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
488  * Parameters are assumed to be verified by the caller.
489  */
490 /* ARGSUSED */
491 int
492 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
493     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
494     void *thisdg_attrs, cred_t *cr)
495 {
496 	tcp_t	*tcp = connp->conn_tcp;
497 	int	*i1 = (int *)invalp;
498 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
499 	boolean_t checkonly;
500 	int	reterr;
501 	tcp_stack_t	*tcps = tcp->tcp_tcps;
502 	conn_opt_arg_t	coas;
503 	uint32_t	val = *((uint32_t *)invalp);
504 
505 	coas.coa_connp = connp;
506 	coas.coa_ixa = connp->conn_ixa;
507 	coas.coa_ipp = &connp->conn_xmit_ipp;
508 	coas.coa_ancillary = B_FALSE;
509 	coas.coa_changed = 0;
510 
511 	switch (optset_context) {
512 	case SETFN_OPTCOM_CHECKONLY:
513 		checkonly = B_TRUE;
514 		/*
515 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
516 		 * inlen != 0 implies value supplied and
517 		 * 	we have to "pretend" to set it.
518 		 * inlen == 0 implies that there is no
519 		 * 	value part in T_CHECK request and just validation
520 		 * done elsewhere should be enough, we just return here.
521 		 */
522 		if (inlen == 0) {
523 			*outlenp = 0;
524 			return (0);
525 		}
526 		break;
527 	case SETFN_OPTCOM_NEGOTIATE:
528 		checkonly = B_FALSE;
529 		break;
530 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
531 	case SETFN_CONN_NEGOTIATE:
532 		checkonly = B_FALSE;
533 		/*
534 		 * Negotiating local and "association-related" options
535 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
536 		 * primitives is allowed by XTI, but we choose
537 		 * to not implement this style negotiation for Internet
538 		 * protocols (We interpret it is a must for OSI world but
539 		 * optional for Internet protocols) for all options.
540 		 * [ Will do only for the few options that enable test
541 		 * suites that our XTI implementation of this feature
542 		 * works for transports that do allow it ]
543 		 */
544 		if (!tcp_allow_connopt_set(level, name)) {
545 			*outlenp = 0;
546 			return (EINVAL);
547 		}
548 		break;
549 	default:
550 		/*
551 		 * We should never get here
552 		 */
553 		*outlenp = 0;
554 		return (EINVAL);
555 	}
556 
557 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
558 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
559 
560 	/*
561 	 * For TCP, we should have no ancillary data sent down
562 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
563 	 * has to be zero.
564 	 */
565 	ASSERT(thisdg_attrs == NULL);
566 
567 	/*
568 	 * For fixed length options, no sanity check
569 	 * of passed in length is done. It is assumed *_optcom_req()
570 	 * routines do the right thing.
571 	 */
572 	switch (level) {
573 	case SOL_SOCKET:
574 		switch (name) {
575 		case SO_KEEPALIVE:
576 			if (checkonly) {
577 				/* check only case */
578 				break;
579 			}
580 
581 			if (!onoff) {
582 				if (connp->conn_keepalive) {
583 					if (tcp->tcp_ka_tid != 0) {
584 						(void) TCP_TIMER_CANCEL(tcp,
585 						    tcp->tcp_ka_tid);
586 						tcp->tcp_ka_tid = 0;
587 					}
588 					connp->conn_keepalive = 0;
589 				}
590 				break;
591 			}
592 			if (!connp->conn_keepalive) {
593 				/* Crank up the keepalive timer */
594 				tcp->tcp_ka_last_intrvl = 0;
595 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
596 				    tcp_keepalive_timer, tcp->tcp_ka_interval);
597 				connp->conn_keepalive = 1;
598 			}
599 			break;
600 		case SO_SNDBUF: {
601 			if (*i1 > tcps->tcps_max_buf) {
602 				*outlenp = 0;
603 				return (ENOBUFS);
604 			}
605 			if (checkonly)
606 				break;
607 
608 			connp->conn_sndbuf = *i1;
609 			if (tcps->tcps_snd_lowat_fraction != 0) {
610 				connp->conn_sndlowat = connp->conn_sndbuf /
611 				    tcps->tcps_snd_lowat_fraction;
612 			}
613 			(void) tcp_maxpsz_set(tcp, B_TRUE);
614 			/*
615 			 * If we are flow-controlled, recheck the condition.
616 			 * There are apps that increase SO_SNDBUF size when
617 			 * flow-controlled (EWOULDBLOCK), and expect the flow
618 			 * control condition to be lifted right away.
619 			 */
620 			mutex_enter(&tcp->tcp_non_sq_lock);
621 			if (tcp->tcp_flow_stopped &&
622 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
623 				tcp_clrqfull(tcp);
624 			}
625 			mutex_exit(&tcp->tcp_non_sq_lock);
626 			*outlenp = inlen;
627 			return (0);
628 		}
629 		case SO_RCVBUF:
630 			if (*i1 > tcps->tcps_max_buf) {
631 				*outlenp = 0;
632 				return (ENOBUFS);
633 			}
634 			/* Silently ignore zero */
635 			if (!checkonly && *i1 != 0) {
636 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
637 				(void) tcp_rwnd_set(tcp, *i1);
638 			}
639 			/*
640 			 * XXX should we return the rwnd here
641 			 * and tcp_opt_get ?
642 			 */
643 			*outlenp = inlen;
644 			return (0);
645 		case SO_SND_COPYAVOID:
646 			if (!checkonly) {
647 				if (tcp->tcp_loopback ||
648 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
649 					*outlenp = 0;
650 					return (EOPNOTSUPP);
651 				}
652 				tcp->tcp_snd_zcopy_aware = 1;
653 			}
654 			*outlenp = inlen;
655 			return (0);
656 		}
657 		break;
658 	case IPPROTO_TCP:
659 		switch (name) {
660 		case TCP_NODELAY:
661 			if (!checkonly)
662 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
663 			break;
664 		case TCP_NOTIFY_THRESHOLD:
665 			if (!checkonly)
666 				tcp->tcp_first_timer_threshold = *i1;
667 			break;
668 		case TCP_ABORT_THRESHOLD:
669 			if (!checkonly)
670 				tcp->tcp_second_timer_threshold = *i1;
671 			break;
672 		case TCP_CONN_NOTIFY_THRESHOLD:
673 			if (!checkonly)
674 				tcp->tcp_first_ctimer_threshold = *i1;
675 			break;
676 		case TCP_CONN_ABORT_THRESHOLD:
677 			if (!checkonly)
678 				tcp->tcp_second_ctimer_threshold = *i1;
679 			break;
680 		case TCP_RECVDSTADDR:
681 			if (tcp->tcp_state > TCPS_LISTEN) {
682 				*outlenp = 0;
683 				return (EOPNOTSUPP);
684 			}
685 			/* Setting done in conn_opt_set */
686 			break;
687 		case TCP_INIT_CWND:
688 			if (checkonly)
689 				break;
690 
691 			/*
692 			 * Only allow socket with network configuration
693 			 * privilege to set the initial cwnd to be larger
694 			 * than allowed by RFC 3390.
695 			 */
696 			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
697 				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
698 				    != 0) {
699 					*outlenp = 0;
700 					return (reterr);
701 				}
702 				if (val > tcp_max_init_cwnd) {
703 					*outlenp = 0;
704 					return (EINVAL);
705 				}
706 			}
707 
708 			tcp->tcp_init_cwnd = val;
709 
710 			/*
711 			 * If the socket is connected, AND no outbound data
712 			 * has been sent, reset the actual cwnd values.
713 			 */
714 			if (tcp->tcp_state == TCPS_ESTABLISHED &&
715 			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
716 				tcp->tcp_cwnd =
717 				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
718 			}
719 			break;
720 
721 		/*
722 		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
723 		 * is in milliseconds. TCP_KEEPIDLE is introduced for
724 		 * compatibility with other Unix flavors.
725 		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
726 		 * converting the input to milliseconds.
727 		 */
728 		case TCP_KEEPIDLE:
729 			*i1 *= 1000;
730 			/* FALLTHRU */
731 
732 		case TCP_KEEPALIVE_THRESHOLD:
733 			if (checkonly)
734 				break;
735 
736 			if (*i1 < tcps->tcps_keepalive_interval_low ||
737 			    *i1 > tcps->tcps_keepalive_interval_high) {
738 				*outlenp = 0;
739 				return (EINVAL);
740 			}
741 			if (*i1 != tcp->tcp_ka_interval) {
742 				tcp->tcp_ka_interval = *i1;
743 				/*
744 				 * Check if we need to restart the
745 				 * keepalive timer.
746 				 */
747 				if (tcp->tcp_ka_tid != 0) {
748 					ASSERT(connp->conn_keepalive);
749 					(void) TCP_TIMER_CANCEL(tcp,
750 					    tcp->tcp_ka_tid);
751 					tcp->tcp_ka_last_intrvl = 0;
752 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
753 					    tcp_keepalive_timer,
754 					    tcp->tcp_ka_interval);
755 				}
756 			}
757 			break;
758 
759 		/*
760 		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
761 		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
762 		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
763 		 * tcp_ka_cnt.
764 		 */
765 		case TCP_KEEPCNT:
766 			if (checkonly)
767 				break;
768 
769 			if (*i1 == 0) {
770 				return (EINVAL);
771 			} else if (tcp->tcp_ka_rinterval == 0) {
772 				if ((tcp->tcp_ka_abort_thres / *i1) <
773 				    tcp->tcp_rto_min ||
774 				    (tcp->tcp_ka_abort_thres / *i1) >
775 				    tcp->tcp_rto_max)
776 					return (EINVAL);
777 
778 				tcp->tcp_ka_rinterval =
779 				    tcp->tcp_ka_abort_thres / *i1;
780 			} else {
781 				if ((*i1 * tcp->tcp_ka_rinterval) <
782 				    tcps->tcps_keepalive_abort_interval_low ||
783 				    (*i1 * tcp->tcp_ka_rinterval) >
784 				    tcps->tcps_keepalive_abort_interval_high)
785 					return (EINVAL);
786 				tcp->tcp_ka_abort_thres =
787 				    (*i1 * tcp->tcp_ka_rinterval);
788 			}
789 			tcp->tcp_ka_cnt = *i1;
790 			break;
791 		case TCP_KEEPINTVL:
792 			/*
793 			 * TCP_KEEPINTVL is specified in seconds, but
794 			 * tcp_ka_rinterval is in milliseconds.
795 			 */
796 
797 			if (checkonly)
798 				break;
799 
800 			if ((*i1 * 1000) < tcp->tcp_rto_min ||
801 			    (*i1 * 1000) > tcp->tcp_rto_max)
802 				return (EINVAL);
803 
804 			if (tcp->tcp_ka_cnt == 0) {
805 				tcp->tcp_ka_cnt =
806 				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
807 			} else {
808 				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
809 				    tcps->tcps_keepalive_abort_interval_low ||
810 				    (*i1 * tcp->tcp_ka_cnt * 1000) >
811 				    tcps->tcps_keepalive_abort_interval_high)
812 					return (EINVAL);
813 				tcp->tcp_ka_abort_thres =
814 				    (*i1 * tcp->tcp_ka_cnt * 1000);
815 			}
816 			tcp->tcp_ka_rinterval = *i1 * 1000;
817 			break;
818 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
819 			if (!checkonly) {
820 				if (*i1 <
821 				    tcps->tcps_keepalive_abort_interval_low ||
822 				    *i1 >
823 				    tcps->tcps_keepalive_abort_interval_high) {
824 					*outlenp = 0;
825 					return (EINVAL);
826 				}
827 				tcp->tcp_ka_abort_thres = *i1;
828 				tcp->tcp_ka_cnt = 0;
829 				tcp->tcp_ka_rinterval = 0;
830 			}
831 			break;
832 		case TCP_CORK:
833 			if (!checkonly) {
834 				/*
835 				 * if tcp->tcp_cork was set and is now
836 				 * being unset, we have to make sure that
837 				 * the remaining data gets sent out. Also
838 				 * unset tcp->tcp_cork so that tcp_wput_data()
839 				 * can send data even if it is less than mss
840 				 */
841 				if (tcp->tcp_cork && onoff == 0 &&
842 				    tcp->tcp_unsent > 0) {
843 					tcp->tcp_cork = B_FALSE;
844 					tcp_wput_data(tcp, NULL, B_FALSE);
845 				}
846 				tcp->tcp_cork = onoff;
847 			}
848 			break;
849 		case TCP_RTO_INITIAL: {
850 			clock_t rto;
851 
852 			if (checkonly || val == 0)
853 				break;
854 
855 			/*
856 			 * Sanity checks
857 			 *
858 			 * The initial RTO should be bounded by the minimum
859 			 * and maximum RTO.  And it should also be smaller
860 			 * than the connect attempt abort timeout.  Otherwise,
861 			 * the connection won't be aborted in a period
862 			 * reasonably close to that timeout.
863 			 */
864 			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
865 			    val > tcp->tcp_second_ctimer_threshold ||
866 			    val < tcps->tcps_rexmit_interval_initial_low ||
867 			    val > tcps->tcps_rexmit_interval_initial_high) {
868 				*outlenp = 0;
869 				return (EINVAL);
870 			}
871 			tcp->tcp_rto_initial = val;
872 
873 			/*
874 			 * If TCP has not sent anything, need to re-calculate
875 			 * tcp_rto.  Otherwise, this option change does not
876 			 * really affect anything.
877 			 */
878 			if (tcp->tcp_state >= TCPS_SYN_SENT)
879 				break;
880 
881 			tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
882 			tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
883 			rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
884 			    tcps->tcps_rexmit_interval_extra +
885 			    (tcp->tcp_rtt_sa >> 5) +
886 			    tcps->tcps_conn_grace_period;
887 			TCP_SET_RTO(tcp, rto);
888 			break;
889 		}
890 		case TCP_RTO_MIN:
891 			if (checkonly || val == 0)
892 				break;
893 
894 			if (val < tcps->tcps_rexmit_interval_min_low ||
895 			    val > tcps->tcps_rexmit_interval_min_high ||
896 			    val > tcp->tcp_rto_max) {
897 				*outlenp = 0;
898 				return (EINVAL);
899 			}
900 			tcp->tcp_rto_min = val;
901 			if (tcp->tcp_rto < val)
902 				tcp->tcp_rto = val;
903 			break;
904 		case TCP_RTO_MAX:
905 			if (checkonly || val == 0)
906 				break;
907 
908 			/*
909 			 * Sanity checks
910 			 *
911 			 * The maximum RTO should not be larger than the
912 			 * connection abort timeout.  Otherwise, the
913 			 * connection won't be aborted in a period reasonably
914 			 * close to that timeout.
915 			 */
916 			if (val < tcps->tcps_rexmit_interval_max_low ||
917 			    val > tcps->tcps_rexmit_interval_max_high ||
918 			    val < tcp->tcp_rto_min ||
919 			    val > tcp->tcp_second_timer_threshold) {
920 				*outlenp = 0;
921 				return (EINVAL);
922 			}
923 			tcp->tcp_rto_max = val;
924 			if (tcp->tcp_rto > val)
925 				tcp->tcp_rto = val;
926 			break;
927 		case TCP_LINGER2:
928 			if (checkonly || *i1 == 0)
929 				break;
930 
931 			/*
932 			 * Note that the option value's unit is second.  And
933 			 * the value should be bigger than the private
934 			 * parameter tcp_fin_wait_2_flush_interval's lower
935 			 * bound and smaller than the current value of that
936 			 * parameter.  It should be smaller than the current
937 			 * value to avoid an app setting TCP_LINGER2 to a big
938 			 * value, causing resource to be held up too long in
939 			 * FIN-WAIT-2 state.
940 			 */
941 			if (*i1 < 0 ||
942 			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
943 			    *i1 ||
944 			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
945 			    *i1) {
946 				*outlenp = 0;
947 				return (EINVAL);
948 			}
949 			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
950 			break;
951 		default:
952 			break;
953 		}
954 		break;
955 	case IPPROTO_IP:
956 		if (connp->conn_family != AF_INET) {
957 			*outlenp = 0;
958 			return (EINVAL);
959 		}
960 		switch (name) {
961 		case IP_SEC_OPT:
962 			/*
963 			 * We should not allow policy setting after
964 			 * we start listening for connections.
965 			 */
966 			if (tcp->tcp_state == TCPS_LISTEN) {
967 				return (EINVAL);
968 			}
969 			break;
970 		}
971 		break;
972 	case IPPROTO_IPV6:
973 		/*
974 		 * IPPROTO_IPV6 options are only supported for sockets
975 		 * that are using IPv6 on the wire.
976 		 */
977 		if (connp->conn_ipversion != IPV6_VERSION) {
978 			*outlenp = 0;
979 			return (EINVAL);
980 		}
981 
982 		switch (name) {
983 		case IPV6_RECVPKTINFO:
984 			if (!checkonly) {
985 				/* Force it to be sent up with the next msg */
986 				tcp->tcp_recvifindex = 0;
987 			}
988 			break;
989 		case IPV6_RECVTCLASS:
990 			if (!checkonly) {
991 				/* Force it to be sent up with the next msg */
992 				tcp->tcp_recvtclass = 0xffffffffU;
993 			}
994 			break;
995 		case IPV6_RECVHOPLIMIT:
996 			if (!checkonly) {
997 				/* Force it to be sent up with the next msg */
998 				tcp->tcp_recvhops = 0xffffffffU;
999 			}
1000 			break;
1001 		case IPV6_PKTINFO:
1002 			/* This is an extra check for TCP */
1003 			if (inlen == sizeof (struct in6_pktinfo)) {
1004 				struct in6_pktinfo *pkti;
1005 
1006 				pkti = (struct in6_pktinfo *)invalp;
1007 				/*
1008 				 * RFC 3542 states that ipi6_addr must be
1009 				 * the unspecified address when setting the
1010 				 * IPV6_PKTINFO sticky socket option on a
1011 				 * TCP socket.
1012 				 */
1013 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1014 					return (EINVAL);
1015 			}
1016 			break;
1017 		case IPV6_SEC_OPT:
1018 			/*
1019 			 * We should not allow policy setting after
1020 			 * we start listening for connections.
1021 			 */
1022 			if (tcp->tcp_state == TCPS_LISTEN) {
1023 				return (EINVAL);
1024 			}
1025 			break;
1026 		}
1027 		break;
1028 	}
1029 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1030 	    checkonly, cr);
1031 	if (reterr != 0) {
1032 		*outlenp = 0;
1033 		return (reterr);
1034 	}
1035 
1036 	/*
1037 	 * Common case of OK return with outval same as inval
1038 	 */
1039 	if (invalp != outvalp) {
1040 		/* don't trust bcopy for identical src/dst */
1041 		(void) bcopy(invalp, outvalp, inlen);
1042 	}
1043 	*outlenp = inlen;
1044 
1045 	if (coas.coa_changed & COA_HEADER_CHANGED) {
1046 		/* If we are connected we rebuilt the headers */
1047 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1048 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1049 			reterr = tcp_build_hdrs(tcp);
1050 			if (reterr != 0)
1051 				return (reterr);
1052 		}
1053 	}
1054 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
1055 		in6_addr_t nexthop;
1056 
1057 		/*
1058 		 * If we are connected we re-cache the information.
1059 		 * We ignore errors to preserve BSD behavior.
1060 		 * Note that we don't redo IPsec policy lookup here
1061 		 * since the final destination (or source) didn't change.
1062 		 */
1063 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1064 		    &connp->conn_faddr_v6, &nexthop);
1065 
1066 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1067 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1068 			(void) ip_attr_connect(connp, connp->conn_ixa,
1069 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1070 			    &nexthop, connp->conn_fport, NULL, NULL,
1071 			    IPDF_VERIFY_DST);
1072 		}
1073 	}
1074 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1075 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1076 	}
1077 	if (coas.coa_changed & COA_WROFF_CHANGED) {
1078 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
1079 		    tcps->tcps_wroff_xtra;
1080 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
1081 		    connp->conn_wroff);
1082 	}
1083 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1084 		if (IPCL_IS_NONSTR(connp))
1085 			proto_set_rx_oob_opt(connp, onoff);
1086 	}
1087 	return (0);
1088 }
1089