xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_opt_data.c (revision 3f9d6ad73e45c6823b409f93b0c8d4f62861d2d5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/stream.h>
27 #define	_SUN_TPI_VERSION 2
28 #include <sys/tihdr.h>
29 #include <sys/socket.h>
30 #include <sys/xti_xtiopt.h>
31 #include <sys/xti_inet.h>
32 #include <sys/policy.h>
33 
34 #include <inet/common.h>
35 #include <netinet/ip6.h>
36 #include <inet/ip.h>
37 
38 #include <netinet/in.h>
39 #include <netinet/tcp.h>
40 #include <inet/optcom.h>
41 #include <inet/proto_set.h>
42 #include <inet/tcp_impl.h>
43 
44 /*
45  * Table of all known options handled on a TCP protocol stack.
46  *
47  * Note: This table contains options processed by both TCP and IP levels
48  *       and is the superset of options that can be performed on a TCP over IP
49  *       stack.
50  */
51 opdes_t	tcp_opt_arr[] = {
52 
53 { SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
54 	sizeof (struct linger), 0 },
55 
56 { SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
57 { SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
58 { SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
59 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
60 	},
61 { SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
62 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
63 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
64 { SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
65 { SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66 { SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
68 	sizeof (struct timeval), 0 },
69 { SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
70 	sizeof (struct timeval), 0 },
71 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
72 	},
73 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
74 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
75 	0 },
76 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
77 	0 },
78 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
79 	0 },
80 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
81 	0 },
82 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
83 
84 { SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
85 
86 { SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
87 
88 { TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
89 	},
90 { TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
91 	536 },
92 
93 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
94 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
95 
96 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
97 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
98 
99 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
100 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
101 
102 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
103 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
104 
105 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
106 	0 },
107 
108 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
109 	sizeof (int), 0 },
110 
111 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
112 	},
113 
114 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
115 	sizeof (int), 0 },
116 
117 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
118 	sizeof (int), 0	},
119 
120 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
121 	sizeof (int), 0	},
122 
123 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
124 
125 { IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
126 	(OP_VARLEN|OP_NODEFAULT),
127 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
128 { T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
129 	(OP_VARLEN|OP_NODEFAULT),
130 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
131 
132 { IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
133 { T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
134 { IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
135 	sizeof (int), -1 /* not initialized */ },
136 
137 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
138 	sizeof (ipsec_req_t), -1 /* not initialized */ },
139 
140 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
141 	sizeof (int),	0 /* no ifindex */ },
142 
143 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
144 	sizeof (int), 0 },
145 
146 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
147 	sizeof (int), -1 /* not initialized */ },
148 
149 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
150 	sizeof (int),	0 /* no ifindex */ },
151 
152 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
153 
154 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
155 	sizeof (in_addr_t),	-1 /* not initialized  */ },
156 
157 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
158 	sizeof (int), 0 },
159 
160 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
161 	(OP_NODEFAULT|OP_VARLEN),
162 	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
163 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
164 	OP_NODEFAULT,
165 	sizeof (sin6_t), -1 /* not initialized */ },
166 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
167 	(OP_VARLEN|OP_NODEFAULT), 255*8,
168 	-1 /* not initialized */ },
169 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
170 	(OP_VARLEN|OP_NODEFAULT), 255*8,
171 	-1 /* not initialized */ },
172 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
173 	(OP_VARLEN|OP_NODEFAULT), 255*8,
174 	-1 /* not initialized */ },
175 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
176 	(OP_VARLEN|OP_NODEFAULT), 255*8,
177 	-1 /* not initialized */ },
178 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
179 	OP_NODEFAULT,
180 	sizeof (int), -1 /* not initialized */ },
181 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
182 	OP_NODEFAULT,
183 	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
184 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
185 	sizeof (int), 0 },
186 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
187 	sizeof (int), 0 },
188 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
189 	sizeof (int), 0 },
190 
191 /* Enable receipt of ancillary data */
192 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
193 	sizeof (int), 0 },
194 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
195 	sizeof (int), 0 },
196 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
197 	sizeof (int), 0 },
198 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
199 	sizeof (int), 0 },
200 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
201 	sizeof (int), 0 },
202 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
203 	sizeof (int), 0 },
204 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
205 	sizeof (int), 0 },
206 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
207 	sizeof (int), 0 },
208 
209 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
210 	sizeof (ipsec_req_t), -1 /* not initialized */ },
211 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
212 	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
213 };
214 
215 /*
216  * Table of all supported levels
217  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
218  * any supported options so we need this info separately.
219  *
220  * This is needed only for topmost tpi providers and is used only by
221  * XTI interfaces.
222  */
223 optlevel_t	tcp_valid_levels_arr[] = {
224 	XTI_GENERIC,
225 	SOL_SOCKET,
226 	IPPROTO_TCP,
227 	IPPROTO_IP,
228 	IPPROTO_IPV6
229 };
230 
231 
232 #define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
233 #define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
234 
235 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
236 
237 /*
238  * Initialize option database object for TCP
239  *
240  * This object represents database of options to search passed to
241  * {sock,tpi}optcom_req() interface routine to take care of option
242  * management and associated methods.
243  */
244 
245 optdb_obj_t tcp_opt_obj = {
246 	tcp_opt_default,	/* TCP default value function pointer */
247 	tcp_tpi_opt_get,	/* TCP get function pointer */
248 	tcp_tpi_opt_set,	/* TCP set function pointer */
249 	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
250 	tcp_opt_arr,		/* TCP option database */
251 	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
252 	tcp_valid_levels_arr	/* TCP valid level array */
253 };
254 
/*
 * Upper bound on the TCP initial congestion window (start/restart).
 * tcp_opt_set() rejects a TCP_INIT_CWND request above tcp_max_init_cwnd
 * with EINVAL, even when the caller passes the privilege check.
 */
#define	TCP_MAX_INIT_CWND	16

static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
259 
260 /*
261  * Some TCP options can be "set" by requesting them in the option
262  * buffer. This is needed for XTI feature test though we do not
263  * allow it in general. We interpret that this mechanism is more
264  * applicable to OSI protocols and need not be allowed in general.
265  * This routine filters out options for which it is not allowed (most)
266  * and lets through those (few) for which it is. [ The XTI interface
267  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
268  * ever implemented will have to be allowed here ].
269  */
270 static boolean_t
271 tcp_allow_connopt_set(int level, int name)
272 {
273 
274 	switch (level) {
275 	case IPPROTO_TCP:
276 		switch (name) {
277 		case TCP_NODELAY:
278 			return (B_TRUE);
279 		default:
280 			return (B_FALSE);
281 		}
282 		/*NOTREACHED*/
283 	default:
284 		return (B_FALSE);
285 	}
286 	/*NOTREACHED*/
287 }
288 
289 /*
290  * This routine gets default values of certain options whose default
291  * values are maintained by protocol specific code
292  */
293 /* ARGSUSED */
294 int
295 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
296 {
297 	int32_t	*i1 = (int32_t *)ptr;
298 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
299 
300 	switch (level) {
301 	case IPPROTO_TCP:
302 		switch (name) {
303 		case TCP_NOTIFY_THRESHOLD:
304 			*i1 = tcps->tcps_ip_notify_interval;
305 			break;
306 		case TCP_ABORT_THRESHOLD:
307 			*i1 = tcps->tcps_ip_abort_interval;
308 			break;
309 		case TCP_CONN_NOTIFY_THRESHOLD:
310 			*i1 = tcps->tcps_ip_notify_cinterval;
311 			break;
312 		case TCP_CONN_ABORT_THRESHOLD:
313 			*i1 = tcps->tcps_ip_abort_cinterval;
314 			break;
315 		default:
316 			return (-1);
317 		}
318 		break;
319 	case IPPROTO_IP:
320 		switch (name) {
321 		case IP_TTL:
322 			*i1 = tcps->tcps_ipv4_ttl;
323 			break;
324 		default:
325 			return (-1);
326 		}
327 		break;
328 	case IPPROTO_IPV6:
329 		switch (name) {
330 		case IPV6_UNICAST_HOPS:
331 			*i1 = tcps->tcps_ipv6_hoplimit;
332 			break;
333 		default:
334 			return (-1);
335 		}
336 		break;
337 	default:
338 		return (-1);
339 	}
340 	return (sizeof (int));
341 }
342 
343 /*
344  * TCP routine to get the values of options.
345  */
346 int
347 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
348 {
349 	int		*i1 = (int *)ptr;
350 	tcp_t		*tcp = connp->conn_tcp;
351 	conn_opt_arg_t	coas;
352 	int		retval;
353 
354 	coas.coa_connp = connp;
355 	coas.coa_ixa = connp->conn_ixa;
356 	coas.coa_ipp = &connp->conn_xmit_ipp;
357 	coas.coa_ancillary = B_FALSE;
358 	coas.coa_changed = 0;
359 
360 	switch (level) {
361 	case SOL_SOCKET:
362 		switch (name) {
363 		case SO_SND_COPYAVOID:
364 			*i1 = tcp->tcp_snd_zcopy_on ?
365 			    SO_SND_COPYAVOID : 0;
366 			return (sizeof (int));
367 		case SO_ACCEPTCONN:
368 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
369 			return (sizeof (int));
370 		}
371 		break;
372 	case IPPROTO_TCP:
373 		switch (name) {
374 		case TCP_NODELAY:
375 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
376 			return (sizeof (int));
377 		case TCP_MAXSEG:
378 			*i1 = tcp->tcp_mss;
379 			return (sizeof (int));
380 		case TCP_NOTIFY_THRESHOLD:
381 			*i1 = (int)tcp->tcp_first_timer_threshold;
382 			return (sizeof (int));
383 		case TCP_ABORT_THRESHOLD:
384 			*i1 = tcp->tcp_second_timer_threshold;
385 			return (sizeof (int));
386 		case TCP_CONN_NOTIFY_THRESHOLD:
387 			*i1 = tcp->tcp_first_ctimer_threshold;
388 			return (sizeof (int));
389 		case TCP_CONN_ABORT_THRESHOLD:
390 			*i1 = tcp->tcp_second_ctimer_threshold;
391 			return (sizeof (int));
392 		case TCP_INIT_CWND:
393 			*i1 = tcp->tcp_init_cwnd;
394 			return (sizeof (int));
395 		case TCP_KEEPALIVE_THRESHOLD:
396 			*i1 = tcp->tcp_ka_interval;
397 			return (sizeof (int));
398 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
399 			*i1 = tcp->tcp_ka_abort_thres;
400 			return (sizeof (int));
401 		case TCP_CORK:
402 			*i1 = tcp->tcp_cork;
403 			return (sizeof (int));
404 		}
405 		break;
406 	case IPPROTO_IP:
407 		if (connp->conn_family != AF_INET)
408 			return (-1);
409 		switch (name) {
410 		case IP_OPTIONS:
411 		case T_IP_OPTIONS:
412 			/* Caller ensures enough space */
413 			return (ip_opt_get_user(connp, ptr));
414 		default:
415 			break;
416 		}
417 		break;
418 
419 	case IPPROTO_IPV6:
420 		/*
421 		 * IPPROTO_IPV6 options are only supported for sockets
422 		 * that are using IPv6 on the wire.
423 		 */
424 		if (connp->conn_ipversion != IPV6_VERSION) {
425 			return (-1);
426 		}
427 		switch (name) {
428 		case IPV6_PATHMTU:
429 			if (tcp->tcp_state < TCPS_ESTABLISHED)
430 				return (-1);
431 			break;
432 		}
433 		break;
434 	}
435 	mutex_enter(&connp->conn_lock);
436 	retval = conn_opt_get(&coas, level, name, ptr);
437 	mutex_exit(&connp->conn_lock);
438 	return (retval);
439 }
440 
441 /*
442  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
443  * Parameters are assumed to be verified by the caller.
444  */
445 /* ARGSUSED */
446 int
447 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
448     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
449     void *thisdg_attrs, cred_t *cr)
450 {
451 	tcp_t	*tcp = connp->conn_tcp;
452 	int	*i1 = (int *)invalp;
453 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
454 	boolean_t checkonly;
455 	int	reterr;
456 	tcp_stack_t	*tcps = tcp->tcp_tcps;
457 	conn_opt_arg_t	coas;
458 
459 	coas.coa_connp = connp;
460 	coas.coa_ixa = connp->conn_ixa;
461 	coas.coa_ipp = &connp->conn_xmit_ipp;
462 	coas.coa_ancillary = B_FALSE;
463 	coas.coa_changed = 0;
464 
465 	switch (optset_context) {
466 	case SETFN_OPTCOM_CHECKONLY:
467 		checkonly = B_TRUE;
468 		/*
469 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
470 		 * inlen != 0 implies value supplied and
471 		 * 	we have to "pretend" to set it.
472 		 * inlen == 0 implies that there is no
473 		 * 	value part in T_CHECK request and just validation
474 		 * done elsewhere should be enough, we just return here.
475 		 */
476 		if (inlen == 0) {
477 			*outlenp = 0;
478 			return (0);
479 		}
480 		break;
481 	case SETFN_OPTCOM_NEGOTIATE:
482 		checkonly = B_FALSE;
483 		break;
484 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
485 	case SETFN_CONN_NEGOTIATE:
486 		checkonly = B_FALSE;
487 		/*
488 		 * Negotiating local and "association-related" options
489 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
490 		 * primitives is allowed by XTI, but we choose
491 		 * to not implement this style negotiation for Internet
492 		 * protocols (We interpret it is a must for OSI world but
493 		 * optional for Internet protocols) for all options.
494 		 * [ Will do only for the few options that enable test
495 		 * suites that our XTI implementation of this feature
496 		 * works for transports that do allow it ]
497 		 */
498 		if (!tcp_allow_connopt_set(level, name)) {
499 			*outlenp = 0;
500 			return (EINVAL);
501 		}
502 		break;
503 	default:
504 		/*
505 		 * We should never get here
506 		 */
507 		*outlenp = 0;
508 		return (EINVAL);
509 	}
510 
511 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
512 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
513 
514 	/*
515 	 * For TCP, we should have no ancillary data sent down
516 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
517 	 * has to be zero.
518 	 */
519 	ASSERT(thisdg_attrs == NULL);
520 
521 	/*
522 	 * For fixed length options, no sanity check
523 	 * of passed in length is done. It is assumed *_optcom_req()
524 	 * routines do the right thing.
525 	 */
526 	switch (level) {
527 	case SOL_SOCKET:
528 		switch (name) {
529 		case SO_KEEPALIVE:
530 			if (checkonly) {
531 				/* check only case */
532 				break;
533 			}
534 
535 			if (!onoff) {
536 				if (connp->conn_keepalive) {
537 					if (tcp->tcp_ka_tid != 0) {
538 						(void) TCP_TIMER_CANCEL(tcp,
539 						    tcp->tcp_ka_tid);
540 						tcp->tcp_ka_tid = 0;
541 					}
542 					connp->conn_keepalive = 0;
543 				}
544 				break;
545 			}
546 			if (!connp->conn_keepalive) {
547 				/* Crank up the keepalive timer */
548 				tcp->tcp_ka_last_intrvl = 0;
549 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
550 				    tcp_keepalive_timer, tcp->tcp_ka_interval);
551 				connp->conn_keepalive = 1;
552 			}
553 			break;
554 		case SO_SNDBUF: {
555 			if (*i1 > tcps->tcps_max_buf) {
556 				*outlenp = 0;
557 				return (ENOBUFS);
558 			}
559 			if (checkonly)
560 				break;
561 
562 			connp->conn_sndbuf = *i1;
563 			if (tcps->tcps_snd_lowat_fraction != 0) {
564 				connp->conn_sndlowat = connp->conn_sndbuf /
565 				    tcps->tcps_snd_lowat_fraction;
566 			}
567 			(void) tcp_maxpsz_set(tcp, B_TRUE);
568 			/*
569 			 * If we are flow-controlled, recheck the condition.
570 			 * There are apps that increase SO_SNDBUF size when
571 			 * flow-controlled (EWOULDBLOCK), and expect the flow
572 			 * control condition to be lifted right away.
573 			 */
574 			mutex_enter(&tcp->tcp_non_sq_lock);
575 			if (tcp->tcp_flow_stopped &&
576 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
577 				tcp_clrqfull(tcp);
578 			}
579 			mutex_exit(&tcp->tcp_non_sq_lock);
580 			*outlenp = inlen;
581 			return (0);
582 		}
583 		case SO_RCVBUF:
584 			if (*i1 > tcps->tcps_max_buf) {
585 				*outlenp = 0;
586 				return (ENOBUFS);
587 			}
588 			/* Silently ignore zero */
589 			if (!checkonly && *i1 != 0) {
590 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
591 				(void) tcp_rwnd_set(tcp, *i1);
592 			}
593 			/*
594 			 * XXX should we return the rwnd here
595 			 * and tcp_opt_get ?
596 			 */
597 			*outlenp = inlen;
598 			return (0);
599 		case SO_SND_COPYAVOID:
600 			if (!checkonly) {
601 				if (tcp->tcp_loopback ||
602 				    (tcp->tcp_kssl_ctx != NULL) ||
603 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
604 					*outlenp = 0;
605 					return (EOPNOTSUPP);
606 				}
607 				tcp->tcp_snd_zcopy_aware = 1;
608 			}
609 			*outlenp = inlen;
610 			return (0);
611 		}
612 		break;
613 	case IPPROTO_TCP:
614 		switch (name) {
615 		case TCP_NODELAY:
616 			if (!checkonly)
617 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
618 			break;
619 		case TCP_NOTIFY_THRESHOLD:
620 			if (!checkonly)
621 				tcp->tcp_first_timer_threshold = *i1;
622 			break;
623 		case TCP_ABORT_THRESHOLD:
624 			if (!checkonly)
625 				tcp->tcp_second_timer_threshold = *i1;
626 			break;
627 		case TCP_CONN_NOTIFY_THRESHOLD:
628 			if (!checkonly)
629 				tcp->tcp_first_ctimer_threshold = *i1;
630 			break;
631 		case TCP_CONN_ABORT_THRESHOLD:
632 			if (!checkonly)
633 				tcp->tcp_second_ctimer_threshold = *i1;
634 			break;
635 		case TCP_RECVDSTADDR:
636 			if (tcp->tcp_state > TCPS_LISTEN) {
637 				*outlenp = 0;
638 				return (EOPNOTSUPP);
639 			}
640 			/* Setting done in conn_opt_set */
641 			break;
642 		case TCP_INIT_CWND: {
643 			uint32_t init_cwnd = *((uint32_t *)invalp);
644 
645 			if (checkonly)
646 				break;
647 
648 			/*
649 			 * Only allow socket with network configuration
650 			 * privilege to set the initial cwnd to be larger
651 			 * than allowed by RFC 3390.
652 			 */
653 			if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
654 				tcp->tcp_init_cwnd = init_cwnd;
655 				break;
656 			}
657 			if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) {
658 				*outlenp = 0;
659 				return (reterr);
660 			}
661 			if (init_cwnd > tcp_max_init_cwnd) {
662 				*outlenp = 0;
663 				return (EINVAL);
664 			}
665 			tcp->tcp_init_cwnd = init_cwnd;
666 			break;
667 		}
668 		case TCP_KEEPALIVE_THRESHOLD:
669 			if (checkonly)
670 				break;
671 
672 			if (*i1 < tcps->tcps_keepalive_interval_low ||
673 			    *i1 > tcps->tcps_keepalive_interval_high) {
674 				*outlenp = 0;
675 				return (EINVAL);
676 			}
677 			if (*i1 != tcp->tcp_ka_interval) {
678 				tcp->tcp_ka_interval = *i1;
679 				/*
680 				 * Check if we need to restart the
681 				 * keepalive timer.
682 				 */
683 				if (tcp->tcp_ka_tid != 0) {
684 					ASSERT(connp->conn_keepalive);
685 					(void) TCP_TIMER_CANCEL(tcp,
686 					    tcp->tcp_ka_tid);
687 					tcp->tcp_ka_last_intrvl = 0;
688 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
689 					    tcp_keepalive_timer,
690 					    tcp->tcp_ka_interval);
691 				}
692 			}
693 			break;
694 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
695 			if (!checkonly) {
696 				if (*i1 <
697 				    tcps->tcps_keepalive_abort_interval_low ||
698 				    *i1 >
699 				    tcps->tcps_keepalive_abort_interval_high) {
700 					*outlenp = 0;
701 					return (EINVAL);
702 				}
703 				tcp->tcp_ka_abort_thres = *i1;
704 			}
705 			break;
706 		case TCP_CORK:
707 			if (!checkonly) {
708 				/*
709 				 * if tcp->tcp_cork was set and is now
710 				 * being unset, we have to make sure that
711 				 * the remaining data gets sent out. Also
712 				 * unset tcp->tcp_cork so that tcp_wput_data()
713 				 * can send data even if it is less than mss
714 				 */
715 				if (tcp->tcp_cork && onoff == 0 &&
716 				    tcp->tcp_unsent > 0) {
717 					tcp->tcp_cork = B_FALSE;
718 					tcp_wput_data(tcp, NULL, B_FALSE);
719 				}
720 				tcp->tcp_cork = onoff;
721 			}
722 			break;
723 		default:
724 			break;
725 		}
726 		break;
727 	case IPPROTO_IP:
728 		if (connp->conn_family != AF_INET) {
729 			*outlenp = 0;
730 			return (EINVAL);
731 		}
732 		switch (name) {
733 		case IP_SEC_OPT:
734 			/*
735 			 * We should not allow policy setting after
736 			 * we start listening for connections.
737 			 */
738 			if (tcp->tcp_state == TCPS_LISTEN) {
739 				return (EINVAL);
740 			}
741 			break;
742 		}
743 		break;
744 	case IPPROTO_IPV6:
745 		/*
746 		 * IPPROTO_IPV6 options are only supported for sockets
747 		 * that are using IPv6 on the wire.
748 		 */
749 		if (connp->conn_ipversion != IPV6_VERSION) {
750 			*outlenp = 0;
751 			return (EINVAL);
752 		}
753 
754 		switch (name) {
755 		case IPV6_RECVPKTINFO:
756 			if (!checkonly) {
757 				/* Force it to be sent up with the next msg */
758 				tcp->tcp_recvifindex = 0;
759 			}
760 			break;
761 		case IPV6_RECVTCLASS:
762 			if (!checkonly) {
763 				/* Force it to be sent up with the next msg */
764 				tcp->tcp_recvtclass = 0xffffffffU;
765 			}
766 			break;
767 		case IPV6_RECVHOPLIMIT:
768 			if (!checkonly) {
769 				/* Force it to be sent up with the next msg */
770 				tcp->tcp_recvhops = 0xffffffffU;
771 			}
772 			break;
773 		case IPV6_PKTINFO:
774 			/* This is an extra check for TCP */
775 			if (inlen == sizeof (struct in6_pktinfo)) {
776 				struct in6_pktinfo *pkti;
777 
778 				pkti = (struct in6_pktinfo *)invalp;
779 				/*
780 				 * RFC 3542 states that ipi6_addr must be
781 				 * the unspecified address when setting the
782 				 * IPV6_PKTINFO sticky socket option on a
783 				 * TCP socket.
784 				 */
785 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
786 					return (EINVAL);
787 			}
788 			break;
789 		case IPV6_SEC_OPT:
790 			/*
791 			 * We should not allow policy setting after
792 			 * we start listening for connections.
793 			 */
794 			if (tcp->tcp_state == TCPS_LISTEN) {
795 				return (EINVAL);
796 			}
797 			break;
798 		}
799 		break;
800 	}
801 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
802 	    checkonly, cr);
803 	if (reterr != 0) {
804 		*outlenp = 0;
805 		return (reterr);
806 	}
807 
808 	/*
809 	 * Common case of OK return with outval same as inval
810 	 */
811 	if (invalp != outvalp) {
812 		/* don't trust bcopy for identical src/dst */
813 		(void) bcopy(invalp, outvalp, inlen);
814 	}
815 	*outlenp = inlen;
816 
817 	if (coas.coa_changed & COA_HEADER_CHANGED) {
818 		/* If we are connected we rebuilt the headers */
819 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
820 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
821 			reterr = tcp_build_hdrs(tcp);
822 			if (reterr != 0)
823 				return (reterr);
824 		}
825 	}
826 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
827 		in6_addr_t nexthop;
828 
829 		/*
830 		 * If we are connected we re-cache the information.
831 		 * We ignore errors to preserve BSD behavior.
832 		 * Note that we don't redo IPsec policy lookup here
833 		 * since the final destination (or source) didn't change.
834 		 */
835 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
836 		    &connp->conn_faddr_v6, &nexthop);
837 
838 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
839 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
840 			(void) ip_attr_connect(connp, connp->conn_ixa,
841 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
842 			    &nexthop, connp->conn_fport, NULL, NULL,
843 			    IPDF_VERIFY_DST);
844 		}
845 	}
846 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
847 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
848 	}
849 	if (coas.coa_changed & COA_WROFF_CHANGED) {
850 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
851 		    tcps->tcps_wroff_xtra;
852 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
853 		    connp->conn_wroff);
854 	}
855 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
856 		if (IPCL_IS_NONSTR(connp))
857 			proto_set_rx_oob_opt(connp, onoff);
858 	}
859 	return (0);
860 }
861