xref: /titanic_41/usr/src/uts/common/inet/tcp/tcp_tunables.c (revision 0db3240d392634cfff2f95fb6da34b56b8dc574f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <inet/ip.h>
27 #include <inet/tcp_impl.h>
28 #include <sys/multidata.h>
29 #include <sys/sunddi.h>
30 
31 /*
 * Largest TCP MSS for each IP version: the maximum IP datagram size
 * (IP_MAXPACKET; max size IP datagram is 64k - 1) less the size of the
 * fixed IP header structure and of the TCP header structure.
 */
32 #define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
33 #define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
34 
35 /*
 * Max of the above.  The IPv4 value is used here.
 * NOTE(review): presumably because ipha_t is smaller than ip6_t, which
 * makes the IPv4 limit the larger of the two -- confirm.
 */
36 #define	TCP_MSS_MAX		TCP_MSS_MAX_IPV4
37 
/*
 * Transmit and receive water marks.  They are referenced by the
 * "send_maxbuf", "tcp_xmit_lowat" and "recv_maxbuf" entries of
 * tcp_propinfo_tbl in this file.
 * NOTE(review): units are presumably bytes -- confirm.
 */
38 #define	TCP_XMIT_LOWATER	4096
39 #define	TCP_XMIT_HIWATER	49152
40 #define	TCP_RECV_LOWATER	2048
41 #define	TCP_RECV_HIWATER	128000
42 
43 /*
44  * Set the RFC 1948 pass phrase
45  */
46 /* ARGSUSED */
47 static int
48 tcp_set_1948phrase(void *cbarg,  cred_t *cr, mod_prop_info_t *pinfo,
49     const char *ifname, const void* pr_val, uint_t flags)
50 {
51 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
52 
53 	if (flags & MOD_PROP_DEFAULT)
54 		return (ENOTSUP);
55 
56 	/*
57 	 * Basically, value contains a new pass phrase.  Pass it along!
58 	 */
59 	tcp_iss_key_init((uint8_t *)pr_val, strlen(pr_val), tcps);
60 	return (0);
61 }
62 
63 /*
64  * returns the current list of listener limit configuration.
65  */
66 /* ARGSUSED */
67 static int
68 tcp_listener_conf_get(void *cbarg, mod_prop_info_t *pinfo, const char *ifname,
69     void *val, uint_t psize, uint_t flags)
70 {
71 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
72 	tcp_listener_t	*tl;
73 	char		*pval = val;
74 	size_t		nbytes = 0, tbytes = 0;
75 	uint_t		size;
76 	int		err = 0;
77 
78 	bzero(pval, psize);
79 	size = psize;
80 
81 	if (flags & (MOD_PROP_DEFAULT|MOD_PROP_PERM|MOD_PROP_POSSIBLE))
82 		return (0);
83 
84 	mutex_enter(&tcps->tcps_listener_conf_lock);
85 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
86 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
87 		if (psize == size)
88 			nbytes = snprintf(pval, size, "%d:%d",  tl->tl_port,
89 			    tl->tl_ratio);
90 		else
91 			nbytes = snprintf(pval, size, ",%d:%d",  tl->tl_port,
92 			    tl->tl_ratio);
93 		size -= nbytes;
94 		pval += nbytes;
95 		tbytes += nbytes;
96 		if (tbytes >= psize) {
97 			/* Buffer overflow, stop copying information */
98 			err = ENOBUFS;
99 			break;
100 		}
101 	}
102 ret:
103 	mutex_exit(&tcps->tcps_listener_conf_lock);
104 	return (err);
105 }
106 
107 /*
108  * add a new listener limit configuration.
109  */
110 /* ARGSUSED */
111 static int
112 tcp_listener_conf_add(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
113     const char *ifname, const void* pval, uint_t flags)
114 {
115 	tcp_listener_t	*new_tl;
116 	tcp_listener_t	*tl;
117 	long		lport;
118 	long		ratio;
119 	char		*colon;
120 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
121 
122 	if (flags & MOD_PROP_DEFAULT)
123 		return (ENOTSUP);
124 
125 	if (ddi_strtol(pval, &colon, 10, &lport) != 0 || lport <= 0 ||
126 	    lport > USHRT_MAX || *colon != ':') {
127 		return (EINVAL);
128 	}
129 	if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0)
130 		return (EINVAL);
131 
132 	mutex_enter(&tcps->tcps_listener_conf_lock);
133 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
134 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
135 		/* There is an existing entry, so update its ratio value. */
136 		if (tl->tl_port == lport) {
137 			tl->tl_ratio = ratio;
138 			mutex_exit(&tcps->tcps_listener_conf_lock);
139 			return (0);
140 		}
141 	}
142 
143 	if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) ==
144 	    NULL) {
145 		mutex_exit(&tcps->tcps_listener_conf_lock);
146 		return (ENOMEM);
147 	}
148 
149 	new_tl->tl_port = lport;
150 	new_tl->tl_ratio = ratio;
151 	list_insert_tail(&tcps->tcps_listener_conf, new_tl);
152 	mutex_exit(&tcps->tcps_listener_conf_lock);
153 	return (0);
154 }
155 
156 /*
157  * remove a listener limit configuration.
158  */
159 /* ARGSUSED */
160 static int
161 tcp_listener_conf_del(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
162     const char *ifname, const void* pval, uint_t flags)
163 {
164 	tcp_listener_t	*tl;
165 	long		lport;
166 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
167 
168 	if (flags & MOD_PROP_DEFAULT)
169 		return (ENOTSUP);
170 
171 	if (ddi_strtol(pval, NULL, 10, &lport) != 0 || lport <= 0 ||
172 	    lport > USHRT_MAX) {
173 		return (EINVAL);
174 	}
175 	mutex_enter(&tcps->tcps_listener_conf_lock);
176 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
177 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
178 		if (tl->tl_port == lport) {
179 			list_remove(&tcps->tcps_listener_conf, tl);
180 			mutex_exit(&tcps->tcps_listener_conf_lock);
181 			kmem_free(tl, sizeof (tcp_listener_t));
182 			return (0);
183 		}
184 	}
185 	mutex_exit(&tcps->tcps_listener_conf_lock);
186 	return (ESRCH);
187 }
188 
189 /*
190  * All of these are alterable, within the min/max values given, at run time.
191  *
192  * Note: All those tunables which do not start with "tcp_" are Committed and
193  * therefore are public. See PSARC 2009/306.
 *
 * Each entry lists, in order: the property name, the protocol it belongs
 * to (MOD_PROTO_TCP for every entry here), the set callback, the get
 * callback, and two brace-enclosed value groups.
 * NOTE(review): for the uint32 tunables the first group reads as
 * {min, max, current value} and the second as {default}; for the boolean
 * tunables each group holds a single value -- confirm against the
 * mod_prop_info_t definition.
 *
 * Entries with a NULL set callback only provide a getter, and entries with
 * a NULL get callback only provide a setter.  The "tunable - N" comments
 * give the zero-based index of the entry that follows them.  The table
 * ends with the "?" entry, served by mod_get_allprop, followed by an
 * all-NULL terminating entry.
194  */
195 mod_prop_info_t tcp_propinfo_tbl[] = {
196 	/* tunable - 0 */
197 	{ "tcp_time_wait_interval", MOD_PROTO_TCP,
198 	    mod_set_uint32, mod_get_uint32,
199 	    {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },
200 
201 	{ "tcp_conn_req_max_q", MOD_PROTO_TCP,
202 	    mod_set_uint32, mod_get_uint32,
203 	    {1, UINT32_MAX, 128}, {128} },
204 
205 	{ "tcp_conn_req_max_q0", MOD_PROTO_TCP,
206 	    mod_set_uint32, mod_get_uint32,
207 	    {0, UINT32_MAX, 1024}, {1024} },
208 
209 	{ "tcp_conn_req_min", MOD_PROTO_TCP,
210 	    mod_set_uint32, mod_get_uint32,
211 	    {1, 1024, 1}, {1} },
212 
213 	{ "tcp_conn_grace_period", MOD_PROTO_TCP,
214 	    mod_set_uint32, mod_get_uint32,
215 	    {0*MS, 20*SECONDS, 0*MS}, {0*MS} },
216 
217 	{ "tcp_cwnd_max", MOD_PROTO_TCP,
218 	    mod_set_uint32, mod_get_uint32,
219 	    {128, (1<<30), 1024*1024}, {1024*1024} },
220 
221 	{ "tcp_debug", MOD_PROTO_TCP,
222 	    mod_set_uint32, mod_get_uint32,
223 	    {0, 10, 0}, {0} },
224 
225 	{ "smallest_nonpriv_port", MOD_PROTO_TCP,
226 	    mod_set_uint32, mod_get_uint32,
227 	    {1024, (32*1024), 1024}, {1024} },
228 
229 	{ "tcp_ip_abort_cinterval", MOD_PROTO_TCP,
230 	    mod_set_uint32, mod_get_uint32,
231 	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },
232 
233 	{ "tcp_ip_abort_linterval", MOD_PROTO_TCP,
234 	    mod_set_uint32, mod_get_uint32,
235 	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },
236 
237 	/* tunable - 10 */
238 	{ "tcp_ip_abort_interval", MOD_PROTO_TCP,
239 	    mod_set_uint32, mod_get_uint32,
240 	    {500*MS, UINT32_MAX, 5*MINUTES}, {5*MINUTES} },
241 
242 	{ "tcp_ip_notify_cinterval", MOD_PROTO_TCP,
243 	    mod_set_uint32, mod_get_uint32,
244 	    {1*SECONDS, UINT32_MAX, 10*SECONDS},
245 	    {10*SECONDS} },
246 
247 	{ "tcp_ip_notify_interval", MOD_PROTO_TCP,
248 	    mod_set_uint32, mod_get_uint32,
249 	    {500*MS, UINT32_MAX, 10*SECONDS}, {10*SECONDS} },
250 
251 	{ "tcp_ipv4_ttl", MOD_PROTO_TCP,
252 	    mod_set_uint32, mod_get_uint32,
253 	    {1, 255, 64}, {64} },
254 
255 	{ "tcp_keepalive_interval", MOD_PROTO_TCP,
256 	    mod_set_uint32, mod_get_uint32,
257 	    {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },
258 
259 	{ "tcp_maxpsz_multiplier", MOD_PROTO_TCP,
260 	    mod_set_uint32, mod_get_uint32,
261 	    {0, 100, 10}, {10} },
262 
263 	{ "tcp_mss_def_ipv4", MOD_PROTO_TCP,
264 	    mod_set_uint32, mod_get_uint32,
265 	    {1, TCP_MSS_MAX_IPV4, 536}, {536} },
266 
267 	{ "tcp_mss_max_ipv4", MOD_PROTO_TCP,
268 	    mod_set_uint32, mod_get_uint32,
269 	    {1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4},
270 	    {TCP_MSS_MAX_IPV4} },
271 
272 	{ "tcp_mss_min", MOD_PROTO_TCP,
273 	    mod_set_uint32, mod_get_uint32,
274 	    {1, TCP_MSS_MAX, 108}, {108} },
275 
276 	{ "tcp_naglim_def", MOD_PROTO_TCP,
277 	    mod_set_uint32, mod_get_uint32,
278 	    {1, (64*1024)-1, (4*1024)-1}, {(4*1024)-1} },
279 
280 	/* tunable - 20 */
281 	{ "tcp_rexmit_interval_initial", MOD_PROTO_TCP,
282 	    mod_set_uint32, mod_get_uint32,
283 	    {1*MS, 20*SECONDS, 1*SECONDS}, {1*SECONDS} },
284 
285 	{ "tcp_rexmit_interval_max", MOD_PROTO_TCP,
286 	    mod_set_uint32, mod_get_uint32,
287 	    {1*MS, 2*HOURS, 60*SECONDS}, {60*SECONDS} },
288 
289 	{ "tcp_rexmit_interval_min", MOD_PROTO_TCP,
290 	    mod_set_uint32, mod_get_uint32,
291 	    {1*MS, 2*HOURS, 400*MS}, {400*MS} },
292 
293 	{ "tcp_deferred_ack_interval", MOD_PROTO_TCP,
294 	    mod_set_uint32, mod_get_uint32,
295 	    {1*MS, 1*MINUTES, 100*MS}, {100*MS} },
296 
297 	{ "tcp_snd_lowat_fraction", MOD_PROTO_TCP,
298 	    mod_set_uint32, mod_get_uint32,
299 	    {0, 16, 0}, {0} },
300 
301 	{ "tcp_dupack_fast_retransmit", MOD_PROTO_TCP,
302 	    mod_set_uint32, mod_get_uint32,
303 	    {1, 10000, 3}, {3} },
304 
305 	{ "tcp_ignore_path_mtu", MOD_PROTO_TCP,
306 	    mod_set_boolean, mod_get_boolean,
307 	    {B_FALSE}, {B_FALSE} },
308 
309 	{ "smallest_anon_port", MOD_PROTO_TCP,
310 	    mod_set_uint32, mod_get_uint32,
311 	    {1024, ULP_MAX_PORT, 32*1024}, {32*1024} },
312 
313 	{ "largest_anon_port", MOD_PROTO_TCP,
314 	    mod_set_uint32, mod_get_uint32,
315 	    {1024, ULP_MAX_PORT, ULP_MAX_PORT},
316 	    {ULP_MAX_PORT} },
317 
318 	{ "send_maxbuf", MOD_PROTO_TCP,
319 	    mod_set_uint32, mod_get_uint32,
320 	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER},
321 	    {TCP_XMIT_HIWATER} },
322 
323 	/* tunable - 30 */
324 	{ "tcp_xmit_lowat", MOD_PROTO_TCP,
325 	    mod_set_uint32, mod_get_uint32,
326 	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER},
327 	    {TCP_XMIT_LOWATER} },
328 
329 	{ "recv_maxbuf", MOD_PROTO_TCP,
330 	    mod_set_uint32, mod_get_uint32,
331 	    {TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER},
332 	    {TCP_RECV_HIWATER} },
333 
334 	{ "tcp_recv_hiwat_minmss", MOD_PROTO_TCP,
335 	    mod_set_uint32, mod_get_uint32,
336 	    {1, 65536, 4}, {4} },
337 
338 	{ "tcp_fin_wait_2_flush_interval", MOD_PROTO_TCP,
339 	    mod_set_uint32, mod_get_uint32,
340 	    {1*SECONDS, UINT32_MAX, 675*SECONDS},
341 	    {675*SECONDS} },
342 
343 	{ "tcp_max_buf", MOD_PROTO_TCP,
344 	    mod_set_uint32, mod_get_uint32,
345 	    {8192, (1<<30), 1024*1024}, {1024*1024} },
346 
347 	/*
348 	 * Question:  What default value should I set for tcp_strong_iss?
349 	 */
350 	{ "tcp_strong_iss", MOD_PROTO_TCP,
351 	    mod_set_uint32, mod_get_uint32,
352 	    {0, 2, 1}, {1} },
353 
354 	{ "tcp_rtt_updates", MOD_PROTO_TCP,
355 	    mod_set_uint32, mod_get_uint32,
356 	    {0, 65536, 20}, {20} },
357 
358 	{ "tcp_wscale_always", MOD_PROTO_TCP,
359 	    mod_set_boolean, mod_get_boolean,
360 	    {B_TRUE}, {B_TRUE} },
361 
362 	{ "tcp_tstamp_always", MOD_PROTO_TCP,
363 	    mod_set_boolean, mod_get_boolean,
364 	    {B_FALSE}, {B_FALSE} },
365 
366 	{ "tcp_tstamp_if_wscale", MOD_PROTO_TCP,
367 	    mod_set_boolean, mod_get_boolean,
368 	    {B_TRUE}, {B_TRUE} },
369 
370 	/* tunable - 40 */
371 	{ "tcp_rexmit_interval_extra", MOD_PROTO_TCP,
372 	    mod_set_uint32, mod_get_uint32,
373 	    {0*MS, 2*HOURS, 0*MS}, {0*MS} },
374 
375 	{ "tcp_deferred_acks_max", MOD_PROTO_TCP,
376 	    mod_set_uint32, mod_get_uint32,
377 	    {0, 16, 2}, {2} },
378 
379 	{ "tcp_slow_start_after_idle", MOD_PROTO_TCP,
380 	    mod_set_uint32, mod_get_uint32,
381 	    {1, 16384, 4}, {4} },
382 
383 	{ "tcp_slow_start_initial", MOD_PROTO_TCP,
384 	    mod_set_uint32, mod_get_uint32,
385 	    {1, 4, 4}, {4} },
386 
387 	{ "sack", MOD_PROTO_TCP,
388 	    mod_set_uint32, mod_get_uint32,
389 	    {0, 2, 2}, {2} },
390 
391 	{ "tcp_ipv6_hoplimit", MOD_PROTO_TCP,
392 	    mod_set_uint32, mod_get_uint32,
393 	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
394 	    {IPV6_DEFAULT_HOPS} },
395 
396 	{ "tcp_mss_def_ipv6", MOD_PROTO_TCP,
397 	    mod_set_uint32, mod_get_uint32,
398 	    {1, TCP_MSS_MAX_IPV6, 1220}, {1220} },
399 
400 	{ "tcp_mss_max_ipv6", MOD_PROTO_TCP,
401 	    mod_set_uint32, mod_get_uint32,
402 	    {1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6},
403 	    {TCP_MSS_MAX_IPV6} },
404 
405 	{ "tcp_rev_src_routes", MOD_PROTO_TCP,
406 	    mod_set_boolean, mod_get_boolean,
407 	    {B_FALSE}, {B_FALSE} },
408 
409 	{ "tcp_local_dack_interval", MOD_PROTO_TCP,
410 	    mod_set_uint32, mod_get_uint32,
411 	    {10*MS, 500*MS, 50*MS}, {50*MS} },
412 
413 	/* tunable - 50 */
414 	{ "tcp_local_dacks_max", MOD_PROTO_TCP,
415 	    mod_set_uint32, mod_get_uint32,
416 	    {0, 16, 8}, {8} },
417 
418 	{ "ecn", MOD_PROTO_TCP,
419 	    mod_set_uint32, mod_get_uint32,
420 	    {0, 2, 1}, {1} },
421 
422 	{ "tcp_rst_sent_rate_enabled", MOD_PROTO_TCP,
423 	    mod_set_boolean, mod_get_boolean,
424 	    {B_TRUE}, {B_TRUE} },
425 
426 	{ "tcp_rst_sent_rate", MOD_PROTO_TCP,
427 	    mod_set_uint32, mod_get_uint32,
428 	    {0, UINT32_MAX, 40}, {40} },
429 
430 	{ "tcp_push_timer_interval", MOD_PROTO_TCP,
431 	    mod_set_uint32, mod_get_uint32,
432 	    {0, 100*MS, 50*MS}, {50*MS} },
433 
434 	{ "tcp_use_smss_as_mss_opt", MOD_PROTO_TCP,
435 	    mod_set_boolean, mod_get_boolean,
436 	    {B_FALSE}, {B_FALSE} },
437 
438 	{ "tcp_keepalive_abort_interval", MOD_PROTO_TCP,
439 	    mod_set_uint32, mod_get_uint32,
440 	    {0, UINT32_MAX, 8*MINUTES}, {8*MINUTES} },
441 
442 	/*
443 	 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
444 	 * layer header.  It has to be a multiple of 8.
445 	 */
446 	{ "tcp_wroff_xtra", MOD_PROTO_TCP,
447 	    mod_set_aligned, mod_get_uint32,
448 	    {0, 256, 32}, {32} },
449 
450 	{ "tcp_dev_flow_ctl", MOD_PROTO_TCP,
451 	    mod_set_boolean, mod_get_boolean,
452 	    {B_FALSE}, {B_FALSE} },
453 
454 	{ "tcp_reass_timeout", MOD_PROTO_TCP,
455 	    mod_set_uint32, mod_get_uint32,
456 	    {0, UINT32_MAX, 100*SECONDS}, {100*SECONDS} },
457 
458 	/* tunable - 60 */
459 	{ "extra_priv_ports", MOD_PROTO_TCP,
460 	    mod_set_extra_privports, mod_get_extra_privports,
461 	    {1, ULP_MAX_PORT, 0}, {0} },
462 
463 	{ "tcp_1948_phrase", MOD_PROTO_TCP,
464 	    tcp_set_1948phrase, NULL, {0}, {0} },
465 
466 	{ "tcp_listener_limit_conf", MOD_PROTO_TCP,
467 	    NULL, tcp_listener_conf_get, {0}, {0} },
468 
469 	{ "tcp_listener_limit_conf_add", MOD_PROTO_TCP,
470 	    tcp_listener_conf_add, NULL, {0}, {0} },
471 
472 	{ "tcp_listener_limit_conf_del", MOD_PROTO_TCP,
473 	    tcp_listener_conf_del, NULL, {0}, {0} },
474 
475 	{ "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} },
476 
477 	{ NULL, 0, NULL, NULL, {0}, {0} }
478 };
479 
/*
 * Size of tcp_propinfo_tbl as computed by A_CNT().
 * NOTE(review): A_CNT presumably yields the number of array elements; if
 * so, this count includes the "?" entry and the terminating NULL entry --
 * confirm.
 */
480 int tcp_propinfo_count = A_CNT(tcp_propinfo_tbl);
481