xref: /titanic_52/usr/src/uts/common/inet/tcp/tcp_misc.c (revision fcacecd0df112c67f46e2cb08c594bc199ec3386)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/strlog.h>
28 #include <sys/policy.h>
29 #include <sys/strsun.h>
30 #include <sys/squeue_impl.h>
31 #include <sys/squeue.h>
32 
33 #include <inet/common.h>
34 #include <inet/ip.h>
35 #include <inet/tcp.h>
36 #include <inet/tcp_impl.h>
37 
38 /* Control whether TCP can enter defensive mode when under memory pressure. */
39 static boolean_t tcp_do_reclaim = B_TRUE;
40 
41 /*
42  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
43  *
44  * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
45  * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
46  * (defined in tcp.h) needs to be filled in and passed into the kernel
47  * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
48  * structure contains the four-tuple of a TCP connection and a range of TCP
49  * states (specified by ac_start and ac_end). The use of wildcard addresses
50  * and ports is allowed. Connections with a matching four tuple and a state
51  * within the specified range will be aborted. The valid states for the
52  * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
53  * inclusive.
54  *
55  * An application which has its connection aborted by this ioctl will receive
56  * an error that is dependent on the connection state at the time of the abort.
57  * If the connection state is < TCPS_TIME_WAIT, an application should behave as
58  * though a RST packet has been received.  If the connection state is equal to
59  * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
60  * and all resources associated with the connection will be freed.
61  */
62 static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
63 static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
64 static void	tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
65     ip_recv_attr_t *dummy);
66 static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
67 void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
68 static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
69     boolean_t, tcp_stack_t *);
70 
/*
 * Accessors for the two socket addresses (ac_local and ac_remote) stored
 * in a tcp_ioc_abort_conn_t.  The same storage is viewed as a sin_t for
 * IPv4 and as a sin6_t for IPv6.  The *ADDR macros yield a pointer to the
 * typed sockaddr; the remaining macros yield the address or port member
 * itself, so they may be assigned to or have their address taken.
 */

/* IPv4 view. */
#define	TCP_AC_V4LADDR(acp)	((sin_t *)&(acp)->ac_local)
#define	TCP_AC_V4RADDR(acp)	((sin_t *)&(acp)->ac_remote)
#define	TCP_AC_V4LOCAL(acp)	(((sin_t *)&(acp)->ac_local)->sin_addr.s_addr)
#define	TCP_AC_V4REMOTE(acp)	(((sin_t *)&(acp)->ac_remote)->sin_addr.s_addr)
#define	TCP_AC_V4LPORT(acp)	(((sin_t *)&(acp)->ac_local)->sin_port)
#define	TCP_AC_V4RPORT(acp)	(((sin_t *)&(acp)->ac_remote)->sin_port)

/* IPv6 view. */
#define	TCP_AC_V6LADDR(acp)	((sin6_t *)&(acp)->ac_local)
#define	TCP_AC_V6RADDR(acp)	((sin6_t *)&(acp)->ac_remote)
#define	TCP_AC_V6LOCAL(acp)	(((sin6_t *)&(acp)->ac_local)->sin6_addr)
#define	TCP_AC_V6REMOTE(acp)	(((sin6_t *)&(acp)->ac_remote)->sin6_addr)
#define	TCP_AC_V6LPORT(acp)	(((sin6_t *)&(acp)->ac_local)->sin6_port)
#define	TCP_AC_V6RPORT(acp)	(((sin6_t *)&(acp)->ac_remote)->sin6_port)
87 
/*
 * Store in (err) the error code that mimics the behavior of a connection
 * reset for a connection currently in (state):
 *
 *	SYN_SENT, SYN_RCVD				ECONNREFUSED
 *	ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT	ECONNRESET
 *	CLOSING, LAST_ACK, TIME_WAIT			0
 *	anything else					ENXIO
 *
 * (state) is evaluated once; (err) must be a modifiable lvalue.  The body
 * is wrapped in do { } while (0) so the macro behaves as a single statement
 * and can safely be used as the unbraced body of an if/else.
 */
#define	TCP_AC_GET_ERRCODE(state, err) do {	\
		switch ((state)) {		\
		case TCPS_SYN_SENT:		\
		case TCPS_SYN_RCVD:		\
			(err) = ECONNREFUSED;	\
			break;			\
		case TCPS_ESTABLISHED:		\
		case TCPS_FIN_WAIT_1:		\
		case TCPS_FIN_WAIT_2:		\
		case TCPS_CLOSE_WAIT:		\
			(err) = ECONNRESET;	\
			break;			\
		case TCPS_CLOSING:		\
		case TCPS_LAST_ACK:		\
		case TCPS_TIME_WAIT:		\
			(err) = 0;		\
			break;			\
		default:			\
			(err) = ENXIO;		\
			break;			\
		}				\
	} while (0)
113 
/*
 * Check whether a connection matches the four-tuple and the state range
 * in acp.  An unspecified address (INADDR_ANY / the IPv6 unspecified
 * address) or a zero port in acp acts as a wildcard for that field.
 * The address family is taken from acp->ac_local.ss_family: AF_INET
 * selects the IPv4 comparison, anything else the IPv6 comparison.
 * All macro arguments may be evaluated more than once.
 */

/* The connection state lies within [ac_start, ac_end]. */
#define	TCP_AC_STATE_MATCH(acp, tcp)				\
	((acp)->ac_start <= (tcp)->tcp_state &&			\
	(acp)->ac_end >= (tcp)->tcp_state)

/* IPv4 addresses, ports and state all match. */
#define	TCP_AC_V4_MATCH(acp, connp, tcp)			\
	((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||		\
	TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) &&	\
	(TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||		\
	TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) &&	\
	(TCP_AC_V4LPORT((acp)) == 0 ||				\
	TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) &&	\
	(TCP_AC_V4RPORT((acp)) == 0 ||				\
	TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) &&	\
	TCP_AC_STATE_MATCH((acp), (tcp)))

/* IPv6 addresses, ports and state all match. */
#define	TCP_AC_V6_MATCH(acp, connp, tcp)			\
	((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||	\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),		\
	&(connp)->conn_laddr_v6)) &&				\
	(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||	\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),		\
	&(connp)->conn_faddr_v6)) &&				\
	(TCP_AC_V6LPORT((acp)) == 0 ||				\
	TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) &&	\
	(TCP_AC_V6RPORT((acp)) == 0 ||				\
	TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) &&	\
	TCP_AC_STATE_MATCH((acp), (tcp)))

#define	TCP_AC_ADDR_MATCH(acp, connp, tcp)			\
	(((acp)->ac_local.ss_family != AF_INET) ?		\
	TCP_AC_V6_MATCH((acp), (connp), (tcp)) :		\
	TCP_AC_V4_MATCH((acp), (connp), (tcp)))
141 
/*
 * Full match test: the zone in acp must be ALL_ZONES or equal to the
 * connection's zone, and the address/port/state test must succeed.
 * Evaluates to 1 on a match and 0 otherwise.
 */
#define	TCP_AC_MATCH(acp, connp, tcp)				\
	(((acp)->ac_zoneid == ALL_ZONES ||			\
	(acp)->ac_zoneid == (connp)->conn_zoneid) &&		\
	TCP_AC_ADDR_MATCH(acp, connp, tcp))
146 
147 /*
148  * Build a message containing a tcp_ioc_abort_conn_t structure
149  * which is filled in with information from acp and tp.
150  */
151 static mblk_t *
152 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
153 {
154 	mblk_t *mp;
155 	tcp_ioc_abort_conn_t *tacp;
156 
157 	mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO);
158 	if (mp == NULL)
159 		return (NULL);
160 
161 	*((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN;
162 	tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr +
163 	    sizeof (uint32_t));
164 
165 	tacp->ac_start = acp->ac_start;
166 	tacp->ac_end = acp->ac_end;
167 	tacp->ac_zoneid = acp->ac_zoneid;
168 
169 	if (acp->ac_local.ss_family == AF_INET) {
170 		tacp->ac_local.ss_family = AF_INET;
171 		tacp->ac_remote.ss_family = AF_INET;
172 		TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4;
173 		TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4;
174 		TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport;
175 		TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport;
176 	} else {
177 		tacp->ac_local.ss_family = AF_INET6;
178 		tacp->ac_remote.ss_family = AF_INET6;
179 		TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6;
180 		TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6;
181 		TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport;
182 		TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport;
183 	}
184 	mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp);
185 	return (mp);
186 }
187 
188 /*
189  * Print a tcp_ioc_abort_conn_t structure.
190  */
191 static void
192 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
193 {
194 	char lbuf[128];
195 	char rbuf[128];
196 	sa_family_t af;
197 	in_port_t lport, rport;
198 	ushort_t logflags;
199 
200 	af = acp->ac_local.ss_family;
201 
202 	if (af == AF_INET) {
203 		(void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp),
204 		    lbuf, 128);
205 		(void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp),
206 		    rbuf, 128);
207 		lport = ntohs(TCP_AC_V4LPORT(acp));
208 		rport = ntohs(TCP_AC_V4RPORT(acp));
209 	} else {
210 		(void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp),
211 		    lbuf, 128);
212 		(void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp),
213 		    rbuf, 128);
214 		lport = ntohs(TCP_AC_V6LPORT(acp));
215 		rport = ntohs(TCP_AC_V6RPORT(acp));
216 	}
217 
218 	logflags = SL_TRACE | SL_NOTE;
219 	/*
220 	 * Don't print this message to the console if the operation was done
221 	 * to a non-global zone.
222 	 */
223 	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
224 		logflags |= SL_CONSOLE;
225 	(void) strlog(TCP_MOD_ID, 0, 1, logflags,
226 	    "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
227 	    "start = %d, end = %d\n", lbuf, lport, rbuf, rport,
228 	    acp->ac_start, acp->ac_end);
229 }
230 
231 /*
232  * Called using SQ_FILL when a message built using
233  * tcp_ioctl_abort_build_msg is put into a queue.
234  * Note that when we get here there is no wildcard in acp any more.
235  */
236 /* ARGSUSED2 */
237 static void
238 tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
239     ip_recv_attr_t *dummy)
240 {
241 	conn_t			*connp = (conn_t *)arg;
242 	tcp_t			*tcp = connp->conn_tcp;
243 	tcp_ioc_abort_conn_t	*acp;
244 
245 	/*
246 	 * Don't accept any input on a closed tcp as this TCP logically does
247 	 * not exist on the system. Don't proceed further with this TCP.
248 	 * For eg. this packet could trigger another close of this tcp
249 	 * which would be disastrous for tcp_refcnt. tcp_close_detached /
250 	 * tcp_clean_death / tcp_closei_local must be called at most once
251 	 * on a TCP.
252 	 */
253 	if (tcp->tcp_state == TCPS_CLOSED ||
254 	    tcp->tcp_state == TCPS_BOUND) {
255 		freemsg(mp);
256 		return;
257 	}
258 
259 	acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t));
260 	if (tcp->tcp_state <= acp->ac_end) {
261 		/*
262 		 * If we get here, we are already on the correct
263 		 * squeue. This ioctl follows the following path
264 		 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn
265 		 * ->tcp_ioctl_abort->squeue_enter (if on a
266 		 * different squeue)
267 		 */
268 		int errcode;
269 
270 		TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode);
271 		(void) tcp_clean_death(tcp, errcode);
272 	}
273 	freemsg(mp);
274 }
275 
276 /*
277  * Abort all matching connections on a hash chain.
278  */
279 static int
280 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count,
281     boolean_t exact, tcp_stack_t *tcps)
282 {
283 	int nmatch, err = 0;
284 	tcp_t *tcp;
285 	MBLKP mp, last, listhead = NULL;
286 	conn_t	*tconnp;
287 	connf_t	*connfp;
288 	ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
289 
290 	connfp = &ipst->ips_ipcl_conn_fanout[index];
291 
292 startover:
293 	nmatch = 0;
294 
295 	mutex_enter(&connfp->connf_lock);
296 	for (tconnp = connfp->connf_head; tconnp != NULL;
297 	    tconnp = tconnp->conn_next) {
298 		tcp = tconnp->conn_tcp;
299 		/*
300 		 * We are missing a check on sin6_scope_id for linklocals here,
301 		 * but current usage is just for aborting based on zoneid
302 		 * for shared-IP zones.
303 		 */
304 		if (TCP_AC_MATCH(acp, tconnp, tcp)) {
305 			CONN_INC_REF(tconnp);
306 			mp = tcp_ioctl_abort_build_msg(acp, tcp);
307 			if (mp == NULL) {
308 				err = ENOMEM;
309 				CONN_DEC_REF(tconnp);
310 				break;
311 			}
312 			mp->b_prev = (mblk_t *)tcp;
313 
314 			if (listhead == NULL) {
315 				listhead = mp;
316 				last = mp;
317 			} else {
318 				last->b_next = mp;
319 				last = mp;
320 			}
321 			nmatch++;
322 			if (exact)
323 				break;
324 		}
325 
326 		/* Avoid holding lock for too long. */
327 		if (nmatch >= 500)
328 			break;
329 	}
330 	mutex_exit(&connfp->connf_lock);
331 
332 	/* Pass mp into the correct tcp */
333 	while ((mp = listhead) != NULL) {
334 		listhead = listhead->b_next;
335 		tcp = (tcp_t *)mp->b_prev;
336 		mp->b_next = mp->b_prev = NULL;
337 		SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp,
338 		    tcp_ioctl_abort_handler, tcp->tcp_connp, NULL,
339 		    SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
340 	}
341 
342 	*count += nmatch;
343 	if (nmatch >= 500 && err == 0)
344 		goto startover;
345 	return (err);
346 }
347 
348 /*
349  * Abort all connections that matches the attributes specified in acp.
350  */
351 static int
352 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps)
353 {
354 	sa_family_t af;
355 	uint32_t  ports;
356 	uint16_t *pports;
357 	int err = 0, count = 0;
358 	boolean_t exact = B_FALSE; /* set when there is no wildcard */
359 	int index = -1;
360 	ushort_t logflags;
361 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
362 
363 	af = acp->ac_local.ss_family;
364 
365 	if (af == AF_INET) {
366 		if (TCP_AC_V4REMOTE(acp) != INADDR_ANY &&
367 		    TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) {
368 			pports = (uint16_t *)&ports;
369 			pports[1] = TCP_AC_V4LPORT(acp);
370 			pports[0] = TCP_AC_V4RPORT(acp);
371 			exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY);
372 		}
373 	} else {
374 		if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) &&
375 		    TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) {
376 			pports = (uint16_t *)&ports;
377 			pports[1] = TCP_AC_V6LPORT(acp);
378 			pports[0] = TCP_AC_V6RPORT(acp);
379 			exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp));
380 		}
381 	}
382 
383 	/*
384 	 * For cases where remote addr, local port, and remote port are non-
385 	 * wildcards, tcp_ioctl_abort_bucket will only be called once.
386 	 */
387 	if (index != -1) {
388 		err = tcp_ioctl_abort_bucket(acp, index,
389 		    &count, exact, tcps);
390 	} else {
391 		/*
392 		 * loop through all entries for wildcard case
393 		 */
394 		for (index = 0;
395 		    index < ipst->ips_ipcl_conn_fanout_size;
396 		    index++) {
397 			err = tcp_ioctl_abort_bucket(acp, index,
398 			    &count, exact, tcps);
399 			if (err != 0)
400 				break;
401 		}
402 	}
403 
404 	logflags = SL_TRACE | SL_NOTE;
405 	/*
406 	 * Don't print this message to the console if the operation was done
407 	 * to a non-global zone.
408 	 */
409 	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
410 		logflags |= SL_CONSOLE;
411 	(void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
412 	    "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
413 	if (err == 0 && count == 0)
414 		err = ENOENT;
415 	return (err);
416 }
417 
418 /*
419  * Process the TCP_IOC_ABORT_CONN ioctl request.
420  */
421 void
422 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp)
423 {
424 	int	err;
425 	IOCP    iocp;
426 	MBLKP   mp1;
427 	sa_family_t laf, raf;
428 	tcp_ioc_abort_conn_t *acp;
429 	zone_t		*zptr;
430 	conn_t		*connp = Q_TO_CONN(q);
431 	zoneid_t	zoneid = connp->conn_zoneid;
432 	tcp_t		*tcp = connp->conn_tcp;
433 	tcp_stack_t	*tcps = tcp->tcp_tcps;
434 
435 	iocp = (IOCP)mp->b_rptr;
436 
437 	if ((mp1 = mp->b_cont) == NULL ||
438 	    iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) {
439 		err = EINVAL;
440 		goto out;
441 	}
442 
443 	/* check permissions */
444 	if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) {
445 		err = EPERM;
446 		goto out;
447 	}
448 
449 	if (mp1->b_cont != NULL) {
450 		freemsg(mp1->b_cont);
451 		mp1->b_cont = NULL;
452 	}
453 
454 	acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr;
455 	laf = acp->ac_local.ss_family;
456 	raf = acp->ac_remote.ss_family;
457 
458 	/* check that a zone with the supplied zoneid exists */
459 	if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) {
460 		zptr = zone_find_by_id(zoneid);
461 		if (zptr != NULL) {
462 			zone_rele(zptr);
463 		} else {
464 			err = EINVAL;
465 			goto out;
466 		}
467 	}
468 
469 	/*
470 	 * For exclusive stacks we set the zoneid to zero
471 	 * to make TCP operate as if in the global zone.
472 	 */
473 	if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID)
474 		acp->ac_zoneid = GLOBAL_ZONEID;
475 
476 	if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT ||
477 	    acp->ac_start > acp->ac_end || laf != raf ||
478 	    (laf != AF_INET && laf != AF_INET6)) {
479 		err = EINVAL;
480 		goto out;
481 	}
482 
483 	tcp_ioctl_abort_dump(acp);
484 	err = tcp_ioctl_abort(acp, tcps);
485 
486 out:
487 	if (mp1 != NULL) {
488 		freemsg(mp1);
489 		mp->b_cont = NULL;
490 	}
491 
492 	if (err != 0)
493 		miocnak(q, mp, 0, err);
494 	else
495 		miocack(q, mp, 0, 0);
496 }
497 
498 /*
499  * Timeout function to reset the TCP stack variable tcps_reclaim to false.
500  */
501 void
502 tcp_reclaim_timer(void *arg)
503 {
504 	tcp_stack_t *tcps = (tcp_stack_t *)arg;
505 	int64_t tot_conn = 0;
506 	int i;
507 	extern pgcnt_t lotsfree, needfree;
508 
509 	for (i = 0; i < tcps->tcps_sc_cnt; i++)
510 		tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
511 
512 	/*
513 	 * This happens only when a stack is going away.  tcps_reclaim_tid
514 	 * should not be reset to 0 when returning in this case.
515 	 */
516 	mutex_enter(&tcps->tcps_reclaim_lock);
517 	if (!tcps->tcps_reclaim) {
518 		mutex_exit(&tcps->tcps_reclaim_lock);
519 		return;
520 	}
521 
522 	if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) {
523 		tcps->tcps_reclaim = B_FALSE;
524 		tcps->tcps_reclaim_tid = 0;
525 	} else {
526 		/* Stay in defensive mode and restart the timer */
527 		tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
528 		    tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
529 	}
530 	mutex_exit(&tcps->tcps_reclaim_lock);
531 }
532 
533 /*
534  * Kmem reclaim call back function.  When the system is under memory
535  * pressure, we set the TCP stack variable tcps_reclaim to true.  This
536  * variable is reset to false after tcps_reclaim_period msecs.  During this
537  * period, TCP will be more aggressive in aborting connections not making
538  * progress, meaning retransmitting for some time (tcp_early_abort seconds).
539  * TCP will also not accept new connection request for those listeners whose
540  * q or q0 is not empty.
541  */
542 /* ARGSUSED */
543 void
544 tcp_conn_reclaim(void *arg)
545 {
546 	netstack_handle_t nh;
547 	netstack_t *ns;
548 	tcp_stack_t *tcps;
549 	extern pgcnt_t lotsfree, needfree;
550 
551 	if (!tcp_do_reclaim)
552 		return;
553 
554 	/*
555 	 * The reclaim function may be called even when the system is not
556 	 * really under memory pressure.
557 	 */
558 	if (freemem >= lotsfree + needfree)
559 		return;
560 
561 	netstack_next_init(&nh);
562 	while ((ns = netstack_next(&nh)) != NULL) {
563 		int i;
564 		int64_t tot_conn = 0;
565 
566 		/*
567 		 * During boot time, the first netstack_t is created and
568 		 * initialized before TCP has registered with the netstack
569 		 * framework.  If this reclaim function is called before TCP
570 		 * has finished its initialization, netstack_next() will
571 		 * return the first netstack_t (since its netstack_flags is
572 		 * not NSF_UNINIT).  And its netstack_tcp will be NULL.  We
573 		 * need to catch it.
574 		 *
575 		 * All subsequent netstack_t creation will not have this
576 		 * problem since the initialization is not finished until TCP
577 		 * has finished its own tcp_stack_t initialization.  Hence
578 		 * netstack_next() will not return one with NULL netstack_tcp.
579 		 */
580 		if ((tcps = ns->netstack_tcp) == NULL) {
581 			netstack_rele(ns);
582 			continue;
583 		}
584 
585 		/*
586 		 * Even if the system is under memory pressure, the reason may
587 		 * not be because of TCP activity.  Check the number of
588 		 * connections in each stack.  If the number exceeds the
589 		 * threshold (maxusers), turn on defensive mode.
590 		 */
591 		for (i = 0; i < tcps->tcps_sc_cnt; i++)
592 			tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
593 		if (tot_conn < maxusers) {
594 			netstack_rele(ns);
595 			continue;
596 		}
597 
598 		mutex_enter(&tcps->tcps_reclaim_lock);
599 		if (!tcps->tcps_reclaim) {
600 			tcps->tcps_reclaim = B_TRUE;
601 			tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
602 			    tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
603 			TCP_STAT(tcps, tcp_reclaim_cnt);
604 		}
605 		mutex_exit(&tcps->tcps_reclaim_lock);
606 		netstack_rele(ns);
607 	}
608 	netstack_next_fini(&nh);
609 }
610 
611 /*
612  * Given a tcp_stack_t and a port (in host byte order), find a listener
613  * configuration for that port and return the ratio.
614  */
615 uint32_t
616 tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port)
617 {
618 	tcp_listener_t	*tl;
619 	uint32_t ratio = 0;
620 
621 	mutex_enter(&tcps->tcps_listener_conf_lock);
622 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
623 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
624 		if (tl->tl_port == port) {
625 			ratio = tl->tl_ratio;
626 			break;
627 		}
628 	}
629 	mutex_exit(&tcps->tcps_listener_conf_lock);
630 	return (ratio);
631 }
632 
633 /*
634  * To remove all listener limit configuration in a tcp_stack_t.
635  */
636 void
637 tcp_listener_conf_cleanup(tcp_stack_t *tcps)
638 {
639 	tcp_listener_t	*tl;
640 
641 	mutex_enter(&tcps->tcps_listener_conf_lock);
642 	while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) {
643 		list_remove(&tcps->tcps_listener_conf, tl);
644 		kmem_free(tl, sizeof (tcp_listener_t));
645 	}
646 	mutex_destroy(&tcps->tcps_listener_conf_lock);
647 	list_destroy(&tcps->tcps_listener_conf);
648 }
649 
650 /*
651  * When a CPU is added, we need to allocate the per CPU stats struct.
652  */
653 void
654 tcp_stack_cpu_add(tcp_stack_t *tcps, processorid_t cpu_seqid)
655 {
656 	int i;
657 
658 	if (cpu_seqid < tcps->tcps_sc_cnt)
659 		return;
660 	for (i = tcps->tcps_sc_cnt; i <= cpu_seqid; i++) {
661 		ASSERT(tcps->tcps_sc[i] == NULL);
662 		tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
663 		    KM_SLEEP);
664 	}
665 	membar_producer();
666 	tcps->tcps_sc_cnt = cpu_seqid + 1;
667 }
668 
669 /*
670  * Diagnostic routine used to return a string associated with the tcp state.
671  * Note that if the caller does not supply a buffer, it will use an internal
672  * static string.  This means that if multiple threads call this function at
673  * the same time, output can be corrupted...  Note also that this function
674  * does not check the size of the supplied buffer.  The caller has to make
675  * sure that it is big enough.
676  */
677 char *
678 tcp_display(tcp_t *tcp, char *sup_buf, char format)
679 {
680 	char		buf1[30];
681 	static char	priv_buf[INET6_ADDRSTRLEN * 2 + 80];
682 	char		*buf;
683 	char		*cp;
684 	in6_addr_t	local, remote;
685 	char		local_addrbuf[INET6_ADDRSTRLEN];
686 	char		remote_addrbuf[INET6_ADDRSTRLEN];
687 	conn_t		*connp;
688 
689 	if (sup_buf != NULL)
690 		buf = sup_buf;
691 	else
692 		buf = priv_buf;
693 
694 	if (tcp == NULL)
695 		return ("NULL_TCP");
696 
697 	connp = tcp->tcp_connp;
698 	switch (tcp->tcp_state) {
699 	case TCPS_CLOSED:
700 		cp = "TCP_CLOSED";
701 		break;
702 	case TCPS_IDLE:
703 		cp = "TCP_IDLE";
704 		break;
705 	case TCPS_BOUND:
706 		cp = "TCP_BOUND";
707 		break;
708 	case TCPS_LISTEN:
709 		cp = "TCP_LISTEN";
710 		break;
711 	case TCPS_SYN_SENT:
712 		cp = "TCP_SYN_SENT";
713 		break;
714 	case TCPS_SYN_RCVD:
715 		cp = "TCP_SYN_RCVD";
716 		break;
717 	case TCPS_ESTABLISHED:
718 		cp = "TCP_ESTABLISHED";
719 		break;
720 	case TCPS_CLOSE_WAIT:
721 		cp = "TCP_CLOSE_WAIT";
722 		break;
723 	case TCPS_FIN_WAIT_1:
724 		cp = "TCP_FIN_WAIT_1";
725 		break;
726 	case TCPS_CLOSING:
727 		cp = "TCP_CLOSING";
728 		break;
729 	case TCPS_LAST_ACK:
730 		cp = "TCP_LAST_ACK";
731 		break;
732 	case TCPS_FIN_WAIT_2:
733 		cp = "TCP_FIN_WAIT_2";
734 		break;
735 	case TCPS_TIME_WAIT:
736 		cp = "TCP_TIME_WAIT";
737 		break;
738 	default:
739 		(void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
740 		cp = buf1;
741 		break;
742 	}
743 	switch (format) {
744 	case DISP_ADDR_AND_PORT:
745 		if (connp->conn_ipversion == IPV4_VERSION) {
746 			/*
747 			 * Note that we use the remote address in the tcp_b
748 			 * structure.  This means that it will print out
749 			 * the real destination address, not the next hop's
750 			 * address if source routing is used.
751 			 */
752 			IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
753 			IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
754 
755 		} else {
756 			local = connp->conn_laddr_v6;
757 			remote = connp->conn_faddr_v6;
758 		}
759 		(void) inet_ntop(AF_INET6, &local, local_addrbuf,
760 		    sizeof (local_addrbuf));
761 		(void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
762 		    sizeof (remote_addrbuf));
763 		(void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
764 		    local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
765 		    ntohs(connp->conn_fport), cp);
766 		break;
767 	case DISP_PORT_ONLY:
768 	default:
769 		(void) mi_sprintf(buf, "[%u, %u] %s",
770 		    ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
771 		break;
772 	}
773 
774 	return (buf);
775 }
776