xref: /titanic_44/usr/src/uts/common/inet/tcp/tcp_tunables.c (revision 74bf729631d9843cabb29019f16ac648de4aaa80)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, Joyent Inc. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 #include <inet/ip.h>
29 #include <inet/tcp_impl.h>
30 #include <sys/multidata.h>
31 #include <sys/sunddi.h>
32 
/*
 * MSS ceilings: the maximum IP datagram size (IP_MAXPACKET, 64k - 1) with the
 * fixed IP header of the given address family and the TCP header taken off.
 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - sizeof (ipha_t) - sizeof (tcpha_t))
#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - sizeof (ip6_t) - sizeof (tcpha_t))

/* Address-family independent ceiling: the larger of the two values above. */
#define	TCP_MSS_MAX		TCP_MSS_MAX_IPV4

/*
 * Low/high water constants.  Within this file they supply the lower bounds
 * and defaults of the send_maxbuf, _xmit_lowat and recv_maxbuf tunables.
 */
#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	128000
44 
45 /*
46  * Set the RFC 1948 pass phrase
47  */
48 /* ARGSUSED */
49 static int
50 tcp_set_1948phrase(void *cbarg,  cred_t *cr, mod_prop_info_t *pinfo,
51     const char *ifname, const void* pr_val, uint_t flags)
52 {
53 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
54 
55 	if (flags & MOD_PROP_DEFAULT)
56 		return (ENOTSUP);
57 
58 	/*
59 	 * Basically, value contains a new pass phrase.  Pass it along!
60 	 */
61 	tcp_iss_key_init((uint8_t *)pr_val, strlen(pr_val), tcps);
62 	return (0);
63 }
64 
65 /*
66  * returns the current list of listener limit configuration.
67  */
68 /* ARGSUSED */
69 static int
70 tcp_listener_conf_get(void *cbarg, mod_prop_info_t *pinfo, const char *ifname,
71     void *val, uint_t psize, uint_t flags)
72 {
73 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
74 	tcp_listener_t	*tl;
75 	char		*pval = val;
76 	size_t		nbytes = 0, tbytes = 0;
77 	uint_t		size;
78 	int		err = 0;
79 
80 	bzero(pval, psize);
81 	size = psize;
82 
83 	if (flags & (MOD_PROP_DEFAULT|MOD_PROP_PERM|MOD_PROP_POSSIBLE))
84 		return (0);
85 
86 	mutex_enter(&tcps->tcps_listener_conf_lock);
87 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
88 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
89 		if (psize == size)
90 			nbytes = snprintf(pval, size, "%d:%d",  tl->tl_port,
91 			    tl->tl_ratio);
92 		else
93 			nbytes = snprintf(pval, size, ",%d:%d",  tl->tl_port,
94 			    tl->tl_ratio);
95 		size -= nbytes;
96 		pval += nbytes;
97 		tbytes += nbytes;
98 		if (tbytes >= psize) {
99 			/* Buffer overflow, stop copying information */
100 			err = ENOBUFS;
101 			break;
102 		}
103 	}
104 
105 	mutex_exit(&tcps->tcps_listener_conf_lock);
106 	return (err);
107 }
108 
109 /*
110  * add a new listener limit configuration.
111  */
112 /* ARGSUSED */
113 static int
114 tcp_listener_conf_add(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
115     const char *ifname, const void* pval, uint_t flags)
116 {
117 	tcp_listener_t	*new_tl;
118 	tcp_listener_t	*tl;
119 	long		lport;
120 	long		ratio;
121 	char		*colon;
122 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
123 
124 	if (flags & MOD_PROP_DEFAULT)
125 		return (ENOTSUP);
126 
127 	if (ddi_strtol(pval, &colon, 10, &lport) != 0 || lport <= 0 ||
128 	    lport > USHRT_MAX || *colon != ':') {
129 		return (EINVAL);
130 	}
131 	if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0)
132 		return (EINVAL);
133 
134 	mutex_enter(&tcps->tcps_listener_conf_lock);
135 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
136 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
137 		/* There is an existing entry, so update its ratio value. */
138 		if (tl->tl_port == lport) {
139 			tl->tl_ratio = ratio;
140 			mutex_exit(&tcps->tcps_listener_conf_lock);
141 			return (0);
142 		}
143 	}
144 
145 	if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) ==
146 	    NULL) {
147 		mutex_exit(&tcps->tcps_listener_conf_lock);
148 		return (ENOMEM);
149 	}
150 
151 	new_tl->tl_port = lport;
152 	new_tl->tl_ratio = ratio;
153 	list_insert_tail(&tcps->tcps_listener_conf, new_tl);
154 	mutex_exit(&tcps->tcps_listener_conf_lock);
155 	return (0);
156 }
157 
158 /*
159  * remove a listener limit configuration.
160  */
161 /* ARGSUSED */
162 static int
163 tcp_listener_conf_del(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
164     const char *ifname, const void* pval, uint_t flags)
165 {
166 	tcp_listener_t	*tl;
167 	long		lport;
168 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
169 
170 	if (flags & MOD_PROP_DEFAULT)
171 		return (ENOTSUP);
172 
173 	if (ddi_strtol(pval, NULL, 10, &lport) != 0 || lport <= 0 ||
174 	    lport > USHRT_MAX) {
175 		return (EINVAL);
176 	}
177 	mutex_enter(&tcps->tcps_listener_conf_lock);
178 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
179 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
180 		if (tl->tl_port == lport) {
181 			list_remove(&tcps->tcps_listener_conf, tl);
182 			mutex_exit(&tcps->tcps_listener_conf_lock);
183 			kmem_free(tl, sizeof (tcp_listener_t));
184 			return (0);
185 		}
186 	}
187 	mutex_exit(&tcps->tcps_listener_conf_lock);
188 	return (ESRCH);
189 }
190 
191 /*
192  * Special checkers for smallest/largest anonymous port so they don't
193  * ever happen to be (largest < smallest).
194  */
195 /* ARGSUSED */
196 static int
197 tcp_smallest_anon_set(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
198     const char *ifname, const void *pval, uint_t flags)
199 {
200 	unsigned long new_value;
201 	tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
202 	int err;
203 
204 	if ((err = mod_uint32_value(pval, pinfo, flags, &new_value)) != 0)
205 		return (err);
206 	/* mod_uint32_value() + pinfo guarantees we're in TCP port range. */
207 	if ((uint32_t)new_value > tcps->tcps_largest_anon_port)
208 		return (ERANGE);
209 	pinfo->prop_cur_uval = (uint32_t)new_value;
210 	return (0);
211 }
212 
213 /* ARGSUSED */
214 static int
215 tcp_largest_anon_set(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
216     const char *ifname, const void *pval, uint_t flags)
217 {
218 	unsigned long new_value;
219 	tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
220 	int err;
221 
222 	if ((err = mod_uint32_value(pval, pinfo, flags, &new_value)) != 0)
223 		return (err);
224 	/* mod_uint32_value() + pinfo guarantees we're in TCP port range. */
225 	if ((uint32_t)new_value < tcps->tcps_smallest_anon_port)
226 		return (ERANGE);
227 	pinfo->prop_cur_uval = (uint32_t)new_value;
228 	return (0);
229 }
230 
231 /*
232  * All of these are alterable, within the min/max values given, at run time.
233  *
234  * Note: All those tunables which do not start with "_" are Committed and
235  * therefore are public. See PSARC 2010/080.
236  */
237 mod_prop_info_t tcp_propinfo_tbl[] = {
238 	/* tunable - 0 */
239 	{ "_time_wait_interval", MOD_PROTO_TCP,
240 	    mod_set_uint32, mod_get_uint32,
241 	    {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },
242 
243 	{ "_conn_req_max_q", MOD_PROTO_TCP,
244 	    mod_set_uint32, mod_get_uint32,
245 	    {1, UINT32_MAX, 128}, {128} },
246 
247 	{ "_conn_req_max_q0", MOD_PROTO_TCP,
248 	    mod_set_uint32, mod_get_uint32,
249 	    {0, UINT32_MAX, 1024}, {1024} },
250 
251 	{ "_conn_req_min", MOD_PROTO_TCP,
252 	    mod_set_uint32, mod_get_uint32,
253 	    {1, 1024, 1}, {1} },
254 
255 	{ "_conn_grace_period", MOD_PROTO_TCP,
256 	    mod_set_uint32, mod_get_uint32,
257 	    {0*MS, 20*SECONDS, 0*MS}, {0*MS} },
258 
259 	{ "_cwnd_max", MOD_PROTO_TCP,
260 	    mod_set_uint32, mod_get_uint32,
261 	    {128, (1<<30), 1024*1024}, {1024*1024} },
262 
263 	{ "_debug", MOD_PROTO_TCP,
264 	    mod_set_uint32, mod_get_uint32,
265 	    {0, 10, 0}, {0} },
266 
267 	{ "smallest_nonpriv_port", MOD_PROTO_TCP,
268 	    mod_set_uint32, mod_get_uint32,
269 	    {1024, (32*1024), 1024}, {1024} },
270 
271 	{ "_ip_abort_cinterval", MOD_PROTO_TCP,
272 	    mod_set_uint32, mod_get_uint32,
273 	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },
274 
275 	{ "_ip_abort_linterval", MOD_PROTO_TCP,
276 	    mod_set_uint32, mod_get_uint32,
277 	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },
278 
279 	/* tunable - 10 */
280 	{ "_ip_abort_interval", MOD_PROTO_TCP,
281 	    mod_set_uint32, mod_get_uint32,
282 	    {500*MS, UINT32_MAX, 5*MINUTES}, {5*MINUTES} },
283 
284 	{ "_ip_notify_cinterval", MOD_PROTO_TCP,
285 	    mod_set_uint32, mod_get_uint32,
286 	    {1*SECONDS, UINT32_MAX, 10*SECONDS},
287 	    {10*SECONDS} },
288 
289 	{ "_ip_notify_interval", MOD_PROTO_TCP,
290 	    mod_set_uint32, mod_get_uint32,
291 	    {500*MS, UINT32_MAX, 10*SECONDS}, {10*SECONDS} },
292 
293 	{ "_ipv4_ttl", MOD_PROTO_TCP,
294 	    mod_set_uint32, mod_get_uint32,
295 	    {1, 255, 64}, {64} },
296 
297 	{ "_keepalive_interval", MOD_PROTO_TCP,
298 	    mod_set_uint32, mod_get_uint32,
299 	    {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },
300 
301 	{ "_maxpsz_multiplier", MOD_PROTO_TCP,
302 	    mod_set_uint32, mod_get_uint32,
303 	    {0, 100, 10}, {10} },
304 
305 	{ "_mss_def_ipv4", MOD_PROTO_TCP,
306 	    mod_set_uint32, mod_get_uint32,
307 	    {1, TCP_MSS_MAX_IPV4, 536}, {536} },
308 
309 	{ "_mss_max_ipv4", MOD_PROTO_TCP,
310 	    mod_set_uint32, mod_get_uint32,
311 	    {1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4},
312 	    {TCP_MSS_MAX_IPV4} },
313 
314 	{ "_mss_min", MOD_PROTO_TCP,
315 	    mod_set_uint32, mod_get_uint32,
316 	    {1, TCP_MSS_MAX, 108}, {108} },
317 
318 	{ "_naglim_def", MOD_PROTO_TCP,
319 	    mod_set_uint32, mod_get_uint32,
320 	    {1, (64*1024)-1, (4*1024)-1}, {(4*1024)-1} },
321 
322 	/* tunable - 20 */
323 	{ "_rexmit_interval_initial", MOD_PROTO_TCP,
324 	    mod_set_uint32, mod_get_uint32,
325 	    {1*MS, 20*SECONDS, 1*SECONDS}, {1*SECONDS} },
326 
327 	{ "_rexmit_interval_max", MOD_PROTO_TCP,
328 	    mod_set_uint32, mod_get_uint32,
329 	    {1*MS, 2*HOURS, 60*SECONDS}, {60*SECONDS} },
330 
331 	{ "_rexmit_interval_min", MOD_PROTO_TCP,
332 	    mod_set_uint32, mod_get_uint32,
333 	    {1*MS, 2*HOURS, 400*MS}, {400*MS} },
334 
335 	{ "_deferred_ack_interval", MOD_PROTO_TCP,
336 	    mod_set_uint32, mod_get_uint32,
337 	    {1*MS, 1*MINUTES, 100*MS}, {100*MS} },
338 
339 	{ "_snd_lowat_fraction", MOD_PROTO_TCP,
340 	    mod_set_uint32, mod_get_uint32,
341 	    {0, 16, 0}, {0} },
342 
343 	{ "_dupack_fast_retransmit", MOD_PROTO_TCP,
344 	    mod_set_uint32, mod_get_uint32,
345 	    {1, 10000, 3}, {3} },
346 
347 	{ "_ignore_path_mtu", MOD_PROTO_TCP,
348 	    mod_set_boolean, mod_get_boolean,
349 	    {B_FALSE}, {B_FALSE} },
350 
351 	{ "smallest_anon_port", MOD_PROTO_TCP,
352 	    tcp_smallest_anon_set, mod_get_uint32,
353 	    {1024, ULP_MAX_PORT, 32*1024}, {32*1024} },
354 
355 	{ "largest_anon_port", MOD_PROTO_TCP,
356 	    tcp_largest_anon_set, mod_get_uint32,
357 	    {1024, ULP_MAX_PORT, ULP_MAX_PORT},
358 	    {ULP_MAX_PORT} },
359 
360 	{ "send_maxbuf", MOD_PROTO_TCP,
361 	    mod_set_uint32, mod_get_uint32,
362 	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER},
363 	    {TCP_XMIT_HIWATER} },
364 
365 	/* tunable - 30 */
366 	{ "_xmit_lowat", MOD_PROTO_TCP,
367 	    mod_set_uint32, mod_get_uint32,
368 	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER},
369 	    {TCP_XMIT_LOWATER} },
370 
371 	{ "recv_maxbuf", MOD_PROTO_TCP,
372 	    mod_set_uint32, mod_get_uint32,
373 	    {TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER},
374 	    {TCP_RECV_HIWATER} },
375 
376 	{ "_recv_hiwat_minmss", MOD_PROTO_TCP,
377 	    mod_set_uint32, mod_get_uint32,
378 	    {1, 65536, 4}, {4} },
379 
380 	{ "_fin_wait_2_flush_interval", MOD_PROTO_TCP,
381 	    mod_set_uint32, mod_get_uint32,
382 	    {1*SECONDS, 2*HOURS, 60*SECONDS},
383 	    {60*SECONDS} },
384 
385 	{ "_max_buf", MOD_PROTO_TCP,
386 	    mod_set_uint32, mod_get_uint32,
387 	    {8192, (1<<30), 1024*1024}, {1024*1024} },
388 
389 	/*
390 	 * Question:  What default value should I set for tcp_strong_iss?
391 	 */
392 	{ "_strong_iss", MOD_PROTO_TCP,
393 	    mod_set_uint32, mod_get_uint32,
394 	    {0, 2, 1}, {1} },
395 
396 	{ "_rtt_updates", MOD_PROTO_TCP,
397 	    mod_set_uint32, mod_get_uint32,
398 	    {0, 65536, 20}, {20} },
399 
400 	{ "_wscale_always", MOD_PROTO_TCP,
401 	    mod_set_boolean, mod_get_boolean,
402 	    {B_TRUE}, {B_TRUE} },
403 
404 	{ "_tstamp_always", MOD_PROTO_TCP,
405 	    mod_set_boolean, mod_get_boolean,
406 	    {B_FALSE}, {B_FALSE} },
407 
408 	{ "_tstamp_if_wscale", MOD_PROTO_TCP,
409 	    mod_set_boolean, mod_get_boolean,
410 	    {B_TRUE}, {B_TRUE} },
411 
412 	/* tunable - 40 */
413 	{ "_rexmit_interval_extra", MOD_PROTO_TCP,
414 	    mod_set_uint32, mod_get_uint32,
415 	    {0*MS, 2*HOURS, 0*MS}, {0*MS} },
416 
417 	{ "_deferred_acks_max", MOD_PROTO_TCP,
418 	    mod_set_uint32, mod_get_uint32,
419 	    {0, 16, 2}, {2} },
420 
421 	{ "_slow_start_after_idle", MOD_PROTO_TCP,
422 	    mod_set_uint32, mod_get_uint32,
423 	    {0, 16384, 0}, {0} },
424 
425 	{ "_slow_start_initial", MOD_PROTO_TCP,
426 	    mod_set_uint32, mod_get_uint32,
427 	    {0, 16, 0}, {0} },
428 
429 	{ "sack", MOD_PROTO_TCP,
430 	    mod_set_uint32, mod_get_uint32,
431 	    {0, 2, 2}, {2} },
432 
433 	{ "_ipv6_hoplimit", MOD_PROTO_TCP,
434 	    mod_set_uint32, mod_get_uint32,
435 	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
436 	    {IPV6_DEFAULT_HOPS} },
437 
438 	{ "_mss_def_ipv6", MOD_PROTO_TCP,
439 	    mod_set_uint32, mod_get_uint32,
440 	    {1, TCP_MSS_MAX_IPV6, 1220}, {1220} },
441 
442 	{ "_mss_max_ipv6", MOD_PROTO_TCP,
443 	    mod_set_uint32, mod_get_uint32,
444 	    {1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6},
445 	    {TCP_MSS_MAX_IPV6} },
446 
447 	{ "_rev_src_routes", MOD_PROTO_TCP,
448 	    mod_set_boolean, mod_get_boolean,
449 	    {B_FALSE}, {B_FALSE} },
450 
451 	{ "_local_dack_interval", MOD_PROTO_TCP,
452 	    mod_set_uint32, mod_get_uint32,
453 	    {10*MS, 500*MS, 50*MS}, {50*MS} },
454 
455 	/* tunable - 50 */
456 	{ "_local_dacks_max", MOD_PROTO_TCP,
457 	    mod_set_uint32, mod_get_uint32,
458 	    {0, 16, 8}, {8} },
459 
460 	{ "ecn", MOD_PROTO_TCP,
461 	    mod_set_uint32, mod_get_uint32,
462 	    {0, 2, 1}, {1} },
463 
464 	{ "_rst_sent_rate_enabled", MOD_PROTO_TCP,
465 	    mod_set_boolean, mod_get_boolean,
466 	    {B_TRUE}, {B_TRUE} },
467 
468 	{ "_rst_sent_rate", MOD_PROTO_TCP,
469 	    mod_set_uint32, mod_get_uint32,
470 	    {0, UINT32_MAX, 40}, {40} },
471 
472 	{ "_push_timer_interval", MOD_PROTO_TCP,
473 	    mod_set_uint32, mod_get_uint32,
474 	    {0, 100*MS, 50*MS}, {50*MS} },
475 
476 	{ "_use_smss_as_mss_opt", MOD_PROTO_TCP,
477 	    mod_set_boolean, mod_get_boolean,
478 	    {B_FALSE}, {B_FALSE} },
479 
480 	{ "_keepalive_abort_interval", MOD_PROTO_TCP,
481 	    mod_set_uint32, mod_get_uint32,
482 	    {0, UINT32_MAX, 8*MINUTES}, {8*MINUTES} },
483 
484 	/*
485 	 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
486 	 * layer header.  It has to be a multiple of 8.
487 	 */
488 	{ "_wroff_xtra", MOD_PROTO_TCP,
489 	    mod_set_aligned, mod_get_uint32,
490 	    {0, 256, 32}, {32} },
491 
492 	{ "_dev_flow_ctl", MOD_PROTO_TCP,
493 	    mod_set_boolean, mod_get_boolean,
494 	    {B_FALSE}, {B_FALSE} },
495 
496 	{ "_reass_timeout", MOD_PROTO_TCP,
497 	    mod_set_uint32, mod_get_uint32,
498 	    {0, UINT32_MAX, 100*SECONDS}, {100*SECONDS} },
499 
500 	/* tunable - 60 */
501 	{ "extra_priv_ports", MOD_PROTO_TCP,
502 	    mod_set_extra_privports, mod_get_extra_privports,
503 	    {1, ULP_MAX_PORT, 0}, {0} },
504 
505 	{ "_1948_phrase", MOD_PROTO_TCP,
506 	    tcp_set_1948phrase, NULL, {0}, {0} },
507 
508 	{ "_listener_limit_conf", MOD_PROTO_TCP,
509 	    NULL, tcp_listener_conf_get, {0}, {0} },
510 
511 	{ "_listener_limit_conf_add", MOD_PROTO_TCP,
512 	    tcp_listener_conf_add, NULL, {0}, {0} },
513 
514 	{ "_listener_limit_conf_del", MOD_PROTO_TCP,
515 	    tcp_listener_conf_del, NULL, {0}, {0} },
516 
517 	{ "_iss_incr", MOD_PROTO_TCP,
518 	    mod_set_uint32, mod_get_uint32,
519 	    {1, ISS_INCR, ISS_INCR},
520 	    {ISS_INCR} },
521 
522 	{ "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} },
523 
524 	{ NULL, 0, NULL, NULL, {0}, {0} }
525 };
526 
527 int tcp_propinfo_count = A_CNT(tcp_propinfo_tbl);
528