xref: /titanic_50/usr/src/uts/common/inet/tcp/tcp_tunables.c (revision 21ad40f5447a73ac8a7ed2b9b66dd73ff1b088c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, Joyent Inc. All rights reserved.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #include <inet/ip.h>
28 #include <inet/tcp_impl.h>
29 #include <sys/multidata.h>
30 #include <sys/sunddi.h>
31 
/*
 * Upper bounds for the TCP MSS tunables.  Max size IP datagram is 64k - 1
 * (IP_MAXPACKET); subtracting the size of the IP header structure for the
 * address family and of the TCP header structure leaves the largest MSS.
 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))

/*
 * Max of the above.
 * NOTE(review): this is the larger value only while ipha_t is no bigger
 * than ip6_t -- confirm against the header definitions.
 */
#define	TCP_MSS_MAX		TCP_MSS_MAX_IPV4

/*
 * Transmit/receive buffer water marks.  They are used in tcp_propinfo_tbl
 * as the lower bounds and initial values of the "send_maxbuf",
 * "_xmit_lowat" and "recv_maxbuf" tunables.
 */
#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	128000
43 
44 /*
45  * Set the RFC 1948 pass phrase
46  */
47 /* ARGSUSED */
48 static int
49 tcp_set_1948phrase(void *cbarg,  cred_t *cr, mod_prop_info_t *pinfo,
50     const char *ifname, const void* pr_val, uint_t flags)
51 {
52 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
53 
54 	if (flags & MOD_PROP_DEFAULT)
55 		return (ENOTSUP);
56 
57 	/*
58 	 * Basically, value contains a new pass phrase.  Pass it along!
59 	 */
60 	tcp_iss_key_init((uint8_t *)pr_val, strlen(pr_val), tcps);
61 	return (0);
62 }
63 
64 /*
65  * returns the current list of listener limit configuration.
66  */
67 /* ARGSUSED */
68 static int
69 tcp_listener_conf_get(void *cbarg, mod_prop_info_t *pinfo, const char *ifname,
70     void *val, uint_t psize, uint_t flags)
71 {
72 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
73 	tcp_listener_t	*tl;
74 	char		*pval = val;
75 	size_t		nbytes = 0, tbytes = 0;
76 	uint_t		size;
77 	int		err = 0;
78 
79 	bzero(pval, psize);
80 	size = psize;
81 
82 	if (flags & (MOD_PROP_DEFAULT|MOD_PROP_PERM|MOD_PROP_POSSIBLE))
83 		return (0);
84 
85 	mutex_enter(&tcps->tcps_listener_conf_lock);
86 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
87 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
88 		if (psize == size)
89 			nbytes = snprintf(pval, size, "%d:%d",  tl->tl_port,
90 			    tl->tl_ratio);
91 		else
92 			nbytes = snprintf(pval, size, ",%d:%d",  tl->tl_port,
93 			    tl->tl_ratio);
94 		size -= nbytes;
95 		pval += nbytes;
96 		tbytes += nbytes;
97 		if (tbytes >= psize) {
98 			/* Buffer overflow, stop copying information */
99 			err = ENOBUFS;
100 			break;
101 		}
102 	}
103 
104 	mutex_exit(&tcps->tcps_listener_conf_lock);
105 	return (err);
106 }
107 
108 /*
109  * add a new listener limit configuration.
110  */
111 /* ARGSUSED */
112 static int
113 tcp_listener_conf_add(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
114     const char *ifname, const void* pval, uint_t flags)
115 {
116 	tcp_listener_t	*new_tl;
117 	tcp_listener_t	*tl;
118 	long		lport;
119 	long		ratio;
120 	char		*colon;
121 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
122 
123 	if (flags & MOD_PROP_DEFAULT)
124 		return (ENOTSUP);
125 
126 	if (ddi_strtol(pval, &colon, 10, &lport) != 0 || lport <= 0 ||
127 	    lport > USHRT_MAX || *colon != ':') {
128 		return (EINVAL);
129 	}
130 	if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0)
131 		return (EINVAL);
132 
133 	mutex_enter(&tcps->tcps_listener_conf_lock);
134 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
135 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
136 		/* There is an existing entry, so update its ratio value. */
137 		if (tl->tl_port == lport) {
138 			tl->tl_ratio = ratio;
139 			mutex_exit(&tcps->tcps_listener_conf_lock);
140 			return (0);
141 		}
142 	}
143 
144 	if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) ==
145 	    NULL) {
146 		mutex_exit(&tcps->tcps_listener_conf_lock);
147 		return (ENOMEM);
148 	}
149 
150 	new_tl->tl_port = lport;
151 	new_tl->tl_ratio = ratio;
152 	list_insert_tail(&tcps->tcps_listener_conf, new_tl);
153 	mutex_exit(&tcps->tcps_listener_conf_lock);
154 	return (0);
155 }
156 
157 /*
158  * remove a listener limit configuration.
159  */
160 /* ARGSUSED */
161 static int
162 tcp_listener_conf_del(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
163     const char *ifname, const void* pval, uint_t flags)
164 {
165 	tcp_listener_t	*tl;
166 	long		lport;
167 	tcp_stack_t	*tcps = (tcp_stack_t *)cbarg;
168 
169 	if (flags & MOD_PROP_DEFAULT)
170 		return (ENOTSUP);
171 
172 	if (ddi_strtol(pval, NULL, 10, &lport) != 0 || lport <= 0 ||
173 	    lport > USHRT_MAX) {
174 		return (EINVAL);
175 	}
176 	mutex_enter(&tcps->tcps_listener_conf_lock);
177 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
178 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
179 		if (tl->tl_port == lport) {
180 			list_remove(&tcps->tcps_listener_conf, tl);
181 			mutex_exit(&tcps->tcps_listener_conf_lock);
182 			kmem_free(tl, sizeof (tcp_listener_t));
183 			return (0);
184 		}
185 	}
186 	mutex_exit(&tcps->tcps_listener_conf_lock);
187 	return (ESRCH);
188 }
189 
/*
 * Table of TCP tunables (properties).
 *
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Note: All those tunables which do not start with "_" are Committed and
 * therefore are public. See PSARC 2010/080.
 *
 * NOTE(review): each entry appears to be laid out as
 *	{ name, protocol, set callback, get callback,
 *	    {min, max, current value} or {current boolean}, {default} }
 * -- confirm against the mod_prop_info_t definition.
 *
 * The "tunable - N" markers give the zero-based index of the entry that
 * follows them.  Entries whose set or get callback is NULL supply no
 * callback for that operation.  The table ends with an entry whose name
 * is NULL.
 */
mod_prop_info_t tcp_propinfo_tbl[] = {
	/* tunable - 0 */
	{ "_time_wait_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },

	{ "_conn_req_max_q", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, UINT32_MAX, 128}, {128} },

	{ "_conn_req_max_q0", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, UINT32_MAX, 1024}, {1024} },

	{ "_conn_req_min", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 1024, 1}, {1} },

	{ "_conn_grace_period", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0*MS, 20*SECONDS, 0*MS}, {0*MS} },

	{ "_cwnd_max", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {128, (1<<30), 1024*1024}, {1024*1024} },

	{ "_debug", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 10, 0}, {0} },

	{ "smallest_nonpriv_port", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1024, (32*1024), 1024}, {1024} },

	{ "_ip_abort_cinterval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },

	{ "_ip_abort_linterval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },

	/* tunable - 10 */
	{ "_ip_abort_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {500*MS, UINT32_MAX, 5*MINUTES}, {5*MINUTES} },

	{ "_ip_notify_cinterval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, UINT32_MAX, 10*SECONDS},
	    {10*SECONDS} },

	{ "_ip_notify_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {500*MS, UINT32_MAX, 10*SECONDS}, {10*SECONDS} },

	{ "_ipv4_ttl", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 255, 64}, {64} },

	{ "_keepalive_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },

	{ "_maxpsz_multiplier", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 100, 10}, {10} },

	{ "_mss_def_ipv4", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX_IPV4, 536}, {536} },

	{ "_mss_max_ipv4", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4},
	    {TCP_MSS_MAX_IPV4} },

	{ "_mss_min", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX, 108}, {108} },

	{ "_naglim_def", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, (64*1024)-1, (4*1024)-1}, {(4*1024)-1} },

	/* tunable - 20 */
	{ "_rexmit_interval_initial", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*MS, 20*SECONDS, 1*SECONDS}, {1*SECONDS} },

	{ "_rexmit_interval_max", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*MS, 2*HOURS, 60*SECONDS}, {60*SECONDS} },

	{ "_rexmit_interval_min", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*MS, 2*HOURS, 400*MS}, {400*MS} },

	{ "_deferred_ack_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*MS, 1*MINUTES, 100*MS}, {100*MS} },

	{ "_snd_lowat_fraction", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 16, 0}, {0} },

	{ "_dupack_fast_retransmit", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 10000, 3}, {3} },

	{ "_ignore_path_mtu", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "smallest_anon_port", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1024, ULP_MAX_PORT, 32*1024}, {32*1024} },

	{ "largest_anon_port", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1024, ULP_MAX_PORT, ULP_MAX_PORT},
	    {ULP_MAX_PORT} },

	{ "send_maxbuf", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER},
	    {TCP_XMIT_HIWATER} },

	/* tunable - 30 */
	{ "_xmit_lowat", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER},
	    {TCP_XMIT_LOWATER} },

	{ "recv_maxbuf", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER},
	    {TCP_RECV_HIWATER} },

	{ "_recv_hiwat_minmss", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 65536, 4}, {4} },

	{ "_fin_wait_2_flush_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, 2*HOURS, 60*SECONDS},
	    {60*SECONDS} },

	{ "_max_buf", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {8192, (1<<30), 1024*1024}, {1024*1024} },

	/*
	 * Open question left by the original author: what default value
	 * should be set for tcp_strong_iss?  It is currently 1, within the
	 * permitted range 0..2.
	 */
	{ "_strong_iss", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 2, 1}, {1} },

	{ "_rtt_updates", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 65536, 20}, {20} },

	{ "_wscale_always", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	{ "_tstamp_always", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_tstamp_if_wscale", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	/* tunable - 40 */
	{ "_rexmit_interval_extra", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0*MS, 2*HOURS, 0*MS}, {0*MS} },

	{ "_deferred_acks_max", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 16, 2}, {2} },

	{ "_slow_start_after_idle", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 16384, 4}, {4} },

	{ "_slow_start_initial", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 4, 4}, {4} },

	{ "sack", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 2, 2}, {2} },

	{ "_ipv6_hoplimit", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
	    {IPV6_DEFAULT_HOPS} },

	{ "_mss_def_ipv6", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX_IPV6, 1220}, {1220} },

	{ "_mss_max_ipv6", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6},
	    {TCP_MSS_MAX_IPV6} },

	{ "_rev_src_routes", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_local_dack_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {10*MS, 500*MS, 50*MS}, {50*MS} },

	/* tunable - 50 */
	{ "_local_dacks_max", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 16, 8}, {8} },

	{ "ecn", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 2, 1}, {1} },

	{ "_rst_sent_rate_enabled", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	{ "_rst_sent_rate", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, UINT32_MAX, 40}, {40} },

	{ "_push_timer_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 100*MS, 50*MS}, {50*MS} },

	{ "_use_smss_as_mss_opt", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_keepalive_abort_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, UINT32_MAX, 8*MINUTES}, {8*MINUTES} },

	/*
	 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
	 * layer header.  It has to be a multiple of 8, which is why this entry
	 * uses mod_set_aligned rather than mod_set_uint32 as its set callback.
	 */
	{ "_wroff_xtra", MOD_PROTO_TCP,
	    mod_set_aligned, mod_get_uint32,
	    {0, 256, 32}, {32} },

	{ "_dev_flow_ctl", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_reass_timeout", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, UINT32_MAX, 100*SECONDS}, {100*SECONDS} },

	/* tunable - 60 */
	{ "extra_priv_ports", MOD_PROTO_TCP,
	    mod_set_extra_privports, mod_get_extra_privports,
	    {1, ULP_MAX_PORT, 0}, {0} },

	/* The pass phrase can be set but has no get callback. */
	{ "_1948_phrase", MOD_PROTO_TCP,
	    tcp_set_1948phrase, NULL, {0}, {0} },

	/*
	 * The listener limit configuration is read through one property and
	 * modified through the separate "_add" and "_del" properties.
	 */
	{ "_listener_limit_conf", MOD_PROTO_TCP,
	    NULL, tcp_listener_conf_get, {0}, {0} },

	{ "_listener_limit_conf_add", MOD_PROTO_TCP,
	    tcp_listener_conf_add, NULL, {0}, {0} },

	{ "_listener_limit_conf_del", MOD_PROTO_TCP,
	    tcp_listener_conf_del, NULL, {0}, {0} },

	{ "_iss_incr", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, ISS_INCR, ISS_INCR},
	    {ISS_INCR} },

	/* "?" is served by mod_get_allprop and has no set callback. */
	{ "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} },

	/* Terminating entry: NULL name. */
	{ NULL, 0, NULL, NULL, {0}, {0} }
};
485 
/*
 * Size of tcp_propinfo_tbl as computed by A_CNT().
 * NOTE(review): if A_CNT() is a plain element-count macro, this figure
 * includes the terminating NULL-named entry -- confirm.
 */
int tcp_propinfo_count = A_CNT(tcp_propinfo_tbl);
487