xref: /titanic_41/usr/src/uts/common/inet/tcp/tcp_tunables.c (revision 05ead181677a01a3a118f8b89ce79361113e34cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <inet/ip.h>
26 #include <inet/tcp_impl.h>
27 #include <sys/multidata.h>
28 #include <sys/sunddi.h>
29 
/*
 * Largest MSS that still fits in a maximum sized IP datagram (IP_MAXPACKET,
 * i.e. 64k - 1) after the IP header and the TCP header have been subtracted.
 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - sizeof (ipha_t) - sizeof (tcpha_t))
#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - sizeof (ip6_t) - sizeof (tcpha_t))

/*
 * Protocol independent upper bound.  The IPv4 value is used; presumably
 * ipha_t is the smaller of the two IP headers, which makes it the larger of
 * the two limits above.
 */
#define	TCP_MSS_MAX		TCP_MSS_MAX_IPV4

/*
 * Buffer water marks (bytes).  In the property table below the LOWATER
 * values serve as lower bounds and the HIWATER values as defaults for the
 * send/receive buffer tunables.
 */
#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	128000
41 
42 /*
43  * Set the RFC 1948 pass phrase
44  */
45 /* ARGSUSED */
46 static int
47 tcp_set_1948phrase(void *cbarg,  cred_t *cr, mod_prop_info_t *pinfo,
48     const char *ifname, const void* pr_val, uint_t flags)
49 {
50 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
51 
52 	if (flags & MOD_PROP_DEFAULT)
53 		return (ENOTSUP);
54 
55 	/*
56 	 * Basically, value contains a new pass phrase.  Pass it along!
57 	 */
58 	tcp_iss_key_init((uint8_t *)pr_val, strlen(pr_val), tcps);
59 	return (0);
60 }
61 
62 /*
63  * returns the current list of listener limit configuration.
64  */
65 /* ARGSUSED */
66 static int
67 tcp_listener_conf_get(void *cbarg, mod_prop_info_t *pinfo, const char *ifname,
68     void *val, uint_t psize, uint_t flags)
69 {
70 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
71 	tcp_listener_t	*tl;
72 	char		*pval = val;
73 	size_t		nbytes = 0, tbytes = 0;
74 	uint_t		size;
75 	int		err = 0;
76 
77 	bzero(pval, psize);
78 	size = psize;
79 
80 	if (flags & (MOD_PROP_DEFAULT|MOD_PROP_PERM|MOD_PROP_POSSIBLE))
81 		return (0);
82 
83 	mutex_enter(&tcps->tcps_listener_conf_lock);
84 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
85 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
86 		if (psize == size)
87 			nbytes = snprintf(pval, size, "%d:%d",  tl->tl_port,
88 			    tl->tl_ratio);
89 		else
90 			nbytes = snprintf(pval, size, ",%d:%d",  tl->tl_port,
91 			    tl->tl_ratio);
92 		size -= nbytes;
93 		pval += nbytes;
94 		tbytes += nbytes;
95 		if (tbytes >= psize) {
96 			/* Buffer overflow, stop copying information */
97 			err = ENOBUFS;
98 			break;
99 		}
100 	}
101 ret:
102 	mutex_exit(&tcps->tcps_listener_conf_lock);
103 	return (err);
104 }
105 
106 /*
107  * add a new listener limit configuration.
108  */
109 /* ARGSUSED */
110 static int
111 tcp_listener_conf_add(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
112     const char *ifname, const void* pval, uint_t flags)
113 {
114 	tcp_listener_t	*new_tl;
115 	tcp_listener_t	*tl;
116 	long		lport;
117 	long		ratio;
118 	char		*colon;
119 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
120 
121 	if (flags & MOD_PROP_DEFAULT)
122 		return (ENOTSUP);
123 
124 	if (ddi_strtol(pval, &colon, 10, &lport) != 0 || lport <= 0 ||
125 	    lport > USHRT_MAX || *colon != ':') {
126 		return (EINVAL);
127 	}
128 	if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0)
129 		return (EINVAL);
130 
131 	mutex_enter(&tcps->tcps_listener_conf_lock);
132 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
133 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
134 		/* There is an existing entry, so update its ratio value. */
135 		if (tl->tl_port == lport) {
136 			tl->tl_ratio = ratio;
137 			mutex_exit(&tcps->tcps_listener_conf_lock);
138 			return (0);
139 		}
140 	}
141 
142 	if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) ==
143 	    NULL) {
144 		mutex_exit(&tcps->tcps_listener_conf_lock);
145 		return (ENOMEM);
146 	}
147 
148 	new_tl->tl_port = lport;
149 	new_tl->tl_ratio = ratio;
150 	list_insert_tail(&tcps->tcps_listener_conf, new_tl);
151 	mutex_exit(&tcps->tcps_listener_conf_lock);
152 	return (0);
153 }
154 
155 /*
156  * remove a listener limit configuration.
157  */
158 /* ARGSUSED */
159 static int
160 tcp_listener_conf_del(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
161     const char *ifname, const void* pval, uint_t flags)
162 {
163 	tcp_listener_t	*tl;
164 	long		lport;
165 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
166 
167 	if (flags & MOD_PROP_DEFAULT)
168 		return (ENOTSUP);
169 
170 	if (ddi_strtol(pval, NULL, 10, &lport) != 0 || lport <= 0 ||
171 	    lport > USHRT_MAX) {
172 		return (EINVAL);
173 	}
174 	mutex_enter(&tcps->tcps_listener_conf_lock);
175 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
176 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
177 		if (tl->tl_port == lport) {
178 			list_remove(&tcps->tcps_listener_conf, tl);
179 			mutex_exit(&tcps->tcps_listener_conf_lock);
180 			kmem_free(tl, sizeof (tcp_listener_t));
181 			return (0);
182 		}
183 	}
184 	mutex_exit(&tcps->tcps_listener_conf_lock);
185 	return (ESRCH);
186 }
187 
188 /*
189  * All of these are alterable, within the min/max values given, at run time.
190  *
191  * Note: All those tunables which do not start with "tcp_" are Committed and
192  * therefore are public. See PSARC 2009/306.
193  */
194 mod_prop_info_t tcp_propinfo_tbl[] = {
195 	/* tunable - 0 */
196 	{ "tcp_time_wait_interval", MOD_PROTO_TCP,
197 	    mod_set_uint32, mod_get_uint32,
198 	    {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },
199 
200 	{ "tcp_conn_req_max_q", MOD_PROTO_TCP,
201 	    mod_set_uint32, mod_get_uint32,
202 	    {1, UINT32_MAX, 128}, {128} },
203 
204 	{ "tcp_conn_req_max_q0", MOD_PROTO_TCP,
205 	    mod_set_uint32, mod_get_uint32,
206 	    {0, UINT32_MAX, 1024}, {1024} },
207 
208 	{ "tcp_conn_req_min", MOD_PROTO_TCP,
209 	    mod_set_uint32, mod_get_uint32,
210 	    {1, 1024, 1}, {1} },
211 
212 	{ "tcp_conn_grace_period", MOD_PROTO_TCP,
213 	    mod_set_uint32, mod_get_uint32,
214 	    {0*MS, 20*SECONDS, 0*MS}, {0*MS} },
215 
216 	{ "tcp_cwnd_max", MOD_PROTO_TCP,
217 	    mod_set_uint32, mod_get_uint32,
218 	    {128, (1<<30), 1024*1024}, {1024*1024} },
219 
220 	{ "tcp_debug", MOD_PROTO_TCP,
221 	    mod_set_uint32, mod_get_uint32,
222 	    {0, 10, 0}, {0} },
223 
224 	{ "smallest_nonpriv_port", MOD_PROTO_TCP,
225 	    mod_set_uint32, mod_get_uint32,
226 	    {1024, (32*1024), 1024}, {1024} },
227 
228 	{ "tcp_ip_abort_cinterval", MOD_PROTO_TCP,
229 	    mod_set_uint32, mod_get_uint32,
230 	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },
231 
232 	{ "tcp_ip_abort_linterval", MOD_PROTO_TCP,
233 	    mod_set_uint32, mod_get_uint32,
234 	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },
235 
236 	/* tunable - 10 */
237 	{ "tcp_ip_abort_interval", MOD_PROTO_TCP,
238 	    mod_set_uint32, mod_get_uint32,
239 	    {500*MS, UINT32_MAX, 5*MINUTES}, {5*MINUTES} },
240 
241 	{ "tcp_ip_notify_cinterval", MOD_PROTO_TCP,
242 	    mod_set_uint32, mod_get_uint32,
243 	    {1*SECONDS, UINT32_MAX, 10*SECONDS},
244 	    {10*SECONDS} },
245 
246 	{ "tcp_ip_notify_interval", MOD_PROTO_TCP,
247 	    mod_set_uint32, mod_get_uint32,
248 	    {500*MS, UINT32_MAX, 10*SECONDS}, {10*SECONDS} },
249 
250 	{ "tcp_ipv4_ttl", MOD_PROTO_TCP,
251 	    mod_set_uint32, mod_get_uint32,
252 	    {1, 255, 64}, {64} },
253 
254 	{ "tcp_keepalive_interval", MOD_PROTO_TCP,
255 	    mod_set_uint32, mod_get_uint32,
256 	    {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },
257 
258 	{ "tcp_maxpsz_multiplier", MOD_PROTO_TCP,
259 	    mod_set_uint32, mod_get_uint32,
260 	    {0, 100, 10}, {10} },
261 
262 	{ "tcp_mss_def_ipv4", MOD_PROTO_TCP,
263 	    mod_set_uint32, mod_get_uint32,
264 	    {1, TCP_MSS_MAX_IPV4, 536}, {536} },
265 
266 	{ "tcp_mss_max_ipv4", MOD_PROTO_TCP,
267 	    mod_set_uint32, mod_get_uint32,
268 	    {1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4},
269 	    {TCP_MSS_MAX_IPV4} },
270 
271 	{ "tcp_mss_min", MOD_PROTO_TCP,
272 	    mod_set_uint32, mod_get_uint32,
273 	    {1, TCP_MSS_MAX, 108}, {108} },
274 
275 	{ "tcp_naglim_def", MOD_PROTO_TCP,
276 	    mod_set_uint32, mod_get_uint32,
277 	    {1, (64*1024)-1, (4*1024)-1}, {(4*1024)-1} },
278 
279 	/* tunable - 20 */
280 	{ "tcp_rexmit_interval_initial", MOD_PROTO_TCP,
281 	    mod_set_uint32, mod_get_uint32,
282 	    {1*MS, 20*SECONDS, 1*SECONDS}, {1*SECONDS} },
283 
284 	{ "tcp_rexmit_interval_max", MOD_PROTO_TCP,
285 	    mod_set_uint32, mod_get_uint32,
286 	    {1*MS, 2*HOURS, 60*SECONDS}, {60*SECONDS} },
287 
288 	{ "tcp_rexmit_interval_min", MOD_PROTO_TCP,
289 	    mod_set_uint32, mod_get_uint32,
290 	    {1*MS, 2*HOURS, 400*MS}, {400*MS} },
291 
292 	{ "tcp_deferred_ack_interval", MOD_PROTO_TCP,
293 	    mod_set_uint32, mod_get_uint32,
294 	    {1*MS, 1*MINUTES, 100*MS}, {100*MS} },
295 
296 	{ "tcp_snd_lowat_fraction", MOD_PROTO_TCP,
297 	    mod_set_uint32, mod_get_uint32,
298 	    {0, 16, 0}, {0} },
299 
300 	{ "tcp_dupack_fast_retransmit", MOD_PROTO_TCP,
301 	    mod_set_uint32, mod_get_uint32,
302 	    {1, 10000, 3}, {3} },
303 
304 	{ "tcp_ignore_path_mtu", MOD_PROTO_TCP,
305 	    mod_set_boolean, mod_get_boolean,
306 	    {B_FALSE}, {B_FALSE} },
307 
308 	{ "smallest_anon_port", MOD_PROTO_TCP,
309 	    mod_set_uint32, mod_get_uint32,
310 	    {1024, ULP_MAX_PORT, 32*1024}, {32*1024} },
311 
312 	{ "largest_anon_port", MOD_PROTO_TCP,
313 	    mod_set_uint32, mod_get_uint32,
314 	    {1024, ULP_MAX_PORT, ULP_MAX_PORT},
315 	    {ULP_MAX_PORT} },
316 
317 	{ "send_maxbuf", MOD_PROTO_TCP,
318 	    mod_set_uint32, mod_get_uint32,
319 	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER},
320 	    {TCP_XMIT_HIWATER} },
321 
322 	/* tunable - 30 */
323 	{ "tcp_xmit_lowat", MOD_PROTO_TCP,
324 	    mod_set_uint32, mod_get_uint32,
325 	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER},
326 	    {TCP_XMIT_LOWATER} },
327 
328 	{ "recv_maxbuf", MOD_PROTO_TCP,
329 	    mod_set_uint32, mod_get_uint32,
330 	    {TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER},
331 	    {TCP_RECV_HIWATER} },
332 
333 	{ "tcp_recv_hiwat_minmss", MOD_PROTO_TCP,
334 	    mod_set_uint32, mod_get_uint32,
335 	    {1, 65536, 4}, {4} },
336 
337 	{ "tcp_fin_wait_2_flush_interval", MOD_PROTO_TCP,
338 	    mod_set_uint32, mod_get_uint32,
339 	    {1*SECONDS, 2*HOURS, 60*SECONDS},
340 	    {60*SECONDS} },
341 
342 	{ "tcp_max_buf", MOD_PROTO_TCP,
343 	    mod_set_uint32, mod_get_uint32,
344 	    {8192, (1<<30), 1024*1024}, {1024*1024} },
345 
346 	/*
347 	 * Question:  What default value should I set for tcp_strong_iss?
348 	 */
349 	{ "tcp_strong_iss", MOD_PROTO_TCP,
350 	    mod_set_uint32, mod_get_uint32,
351 	    {0, 2, 1}, {1} },
352 
353 	{ "tcp_rtt_updates", MOD_PROTO_TCP,
354 	    mod_set_uint32, mod_get_uint32,
355 	    {0, 65536, 20}, {20} },
356 
357 	{ "tcp_wscale_always", MOD_PROTO_TCP,
358 	    mod_set_boolean, mod_get_boolean,
359 	    {B_TRUE}, {B_TRUE} },
360 
361 	{ "tcp_tstamp_always", MOD_PROTO_TCP,
362 	    mod_set_boolean, mod_get_boolean,
363 	    {B_FALSE}, {B_FALSE} },
364 
365 	{ "tcp_tstamp_if_wscale", MOD_PROTO_TCP,
366 	    mod_set_boolean, mod_get_boolean,
367 	    {B_TRUE}, {B_TRUE} },
368 
369 	/* tunable - 40 */
370 	{ "tcp_rexmit_interval_extra", MOD_PROTO_TCP,
371 	    mod_set_uint32, mod_get_uint32,
372 	    {0*MS, 2*HOURS, 0*MS}, {0*MS} },
373 
374 	{ "tcp_deferred_acks_max", MOD_PROTO_TCP,
375 	    mod_set_uint32, mod_get_uint32,
376 	    {0, 16, 2}, {2} },
377 
378 	{ "tcp_slow_start_after_idle", MOD_PROTO_TCP,
379 	    mod_set_uint32, mod_get_uint32,
380 	    {1, 16384, 4}, {4} },
381 
382 	{ "tcp_slow_start_initial", MOD_PROTO_TCP,
383 	    mod_set_uint32, mod_get_uint32,
384 	    {1, 4, 4}, {4} },
385 
386 	{ "sack", MOD_PROTO_TCP,
387 	    mod_set_uint32, mod_get_uint32,
388 	    {0, 2, 2}, {2} },
389 
390 	{ "tcp_ipv6_hoplimit", MOD_PROTO_TCP,
391 	    mod_set_uint32, mod_get_uint32,
392 	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
393 	    {IPV6_DEFAULT_HOPS} },
394 
395 	{ "tcp_mss_def_ipv6", MOD_PROTO_TCP,
396 	    mod_set_uint32, mod_get_uint32,
397 	    {1, TCP_MSS_MAX_IPV6, 1220}, {1220} },
398 
399 	{ "tcp_mss_max_ipv6", MOD_PROTO_TCP,
400 	    mod_set_uint32, mod_get_uint32,
401 	    {1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6},
402 	    {TCP_MSS_MAX_IPV6} },
403 
404 	{ "tcp_rev_src_routes", MOD_PROTO_TCP,
405 	    mod_set_boolean, mod_get_boolean,
406 	    {B_FALSE}, {B_FALSE} },
407 
408 	{ "tcp_local_dack_interval", MOD_PROTO_TCP,
409 	    mod_set_uint32, mod_get_uint32,
410 	    {10*MS, 500*MS, 50*MS}, {50*MS} },
411 
412 	/* tunable - 50 */
413 	{ "tcp_local_dacks_max", MOD_PROTO_TCP,
414 	    mod_set_uint32, mod_get_uint32,
415 	    {0, 16, 8}, {8} },
416 
417 	{ "ecn", MOD_PROTO_TCP,
418 	    mod_set_uint32, mod_get_uint32,
419 	    {0, 2, 1}, {1} },
420 
421 	{ "tcp_rst_sent_rate_enabled", MOD_PROTO_TCP,
422 	    mod_set_boolean, mod_get_boolean,
423 	    {B_TRUE}, {B_TRUE} },
424 
425 	{ "tcp_rst_sent_rate", MOD_PROTO_TCP,
426 	    mod_set_uint32, mod_get_uint32,
427 	    {0, UINT32_MAX, 40}, {40} },
428 
429 	{ "tcp_push_timer_interval", MOD_PROTO_TCP,
430 	    mod_set_uint32, mod_get_uint32,
431 	    {0, 100*MS, 50*MS}, {50*MS} },
432 
433 	{ "tcp_use_smss_as_mss_opt", MOD_PROTO_TCP,
434 	    mod_set_boolean, mod_get_boolean,
435 	    {B_FALSE}, {B_FALSE} },
436 
437 	{ "tcp_keepalive_abort_interval", MOD_PROTO_TCP,
438 	    mod_set_uint32, mod_get_uint32,
439 	    {0, UINT32_MAX, 8*MINUTES}, {8*MINUTES} },
440 
441 	/*
442 	 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
443 	 * layer header.  It has to be a multiple of 8.
444 	 */
445 	{ "tcp_wroff_xtra", MOD_PROTO_TCP,
446 	    mod_set_aligned, mod_get_uint32,
447 	    {0, 256, 32}, {32} },
448 
449 	{ "tcp_dev_flow_ctl", MOD_PROTO_TCP,
450 	    mod_set_boolean, mod_get_boolean,
451 	    {B_FALSE}, {B_FALSE} },
452 
453 	{ "tcp_reass_timeout", MOD_PROTO_TCP,
454 	    mod_set_uint32, mod_get_uint32,
455 	    {0, UINT32_MAX, 100*SECONDS}, {100*SECONDS} },
456 
457 	/* tunable - 60 */
458 	{ "extra_priv_ports", MOD_PROTO_TCP,
459 	    mod_set_extra_privports, mod_get_extra_privports,
460 	    {1, ULP_MAX_PORT, 0}, {0} },
461 
462 	{ "tcp_1948_phrase", MOD_PROTO_TCP,
463 	    tcp_set_1948phrase, NULL, {0}, {0} },
464 
465 	{ "tcp_listener_limit_conf", MOD_PROTO_TCP,
466 	    NULL, tcp_listener_conf_get, {0}, {0} },
467 
468 	{ "tcp_listener_limit_conf_add", MOD_PROTO_TCP,
469 	    tcp_listener_conf_add, NULL, {0}, {0} },
470 
471 	{ "tcp_listener_limit_conf_del", MOD_PROTO_TCP,
472 	    tcp_listener_conf_del, NULL, {0}, {0} },
473 
474 	{ "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} },
475 
476 	{ NULL, 0, NULL, NULL, {0}, {0} }
477 };
478 
479 int tcp_propinfo_count = A_CNT(tcp_propinfo_tbl);
480