xref: /titanic_41/usr/src/uts/common/inet/tcp/tcp_misc.c (revision ea394cb00fd96864e34d2841b4a22357b621c78f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/strlog.h>
29 #include <sys/policy.h>
30 #include <sys/strsun.h>
31 #include <sys/squeue_impl.h>
32 #include <sys/squeue.h>
33 
34 #include <inet/common.h>
35 #include <inet/ip.h>
36 #include <inet/tcp.h>
37 #include <inet/tcp_impl.h>
38 
39 /* Control whether TCP can enter defensive mode when under memory pressure. */
40 static boolean_t tcp_do_reclaim = B_TRUE;
41 
42 /*
43  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
44  *
45  * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
46  * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
47  * (defined in tcp.h) needs to be filled in and passed into the kernel
48  * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
49  * structure contains the four-tuple of a TCP connection and a range of TCP
50  * states (specified by ac_start and ac_end). The use of wildcard addresses
51  * and ports is allowed. Connections with a matching four-tuple and a state
52  * within the specified range will be aborted. The valid states for the
53  * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
54  * inclusive.
55  *
56  * An application which has its connection aborted by this ioctl will receive
57  * an error that is dependent on the connection state at the time of the abort.
58  * If the connection state is < TCPS_TIME_WAIT, an application should behave as
59  * though a RST packet has been received.  If the connection state is equal to
60  * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
61  * and all resources associated with the connection will be freed.
62  */
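/*
 * For illustration only: a sketch of how a privileged user-space program
 * might invoke this ioctl.  The header names, the use of a /dev/tcp stream
 * and the helper itself are assumptions made for the example, not part of
 * this file; addresses and ports are in network byte order and error
 * handling is omitted.
 *
 *	#include <sys/socket.h>
 *	#include <sys/zone.h>
 *	#include <netinet/in.h>
 *	#include <stropts.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <inet/tcp.h>
 *
 *	void
 *	abort_v4_conn(in_addr_t laddr, in_port_t lport, in_addr_t faddr,
 *	    in_port_t fport)
 *	{
 *		tcp_ioc_abort_conn_t ac;
 *		struct strioctl sic;
 *		struct sockaddr_in *l = (struct sockaddr_in *)&ac.ac_local;
 *		struct sockaddr_in *f = (struct sockaddr_in *)&ac.ac_remote;
 *		int fd = open("/dev/tcp", O_RDWR);
 *
 *		(void) memset(&ac, 0, sizeof (ac));
 *		l->sin_family = f->sin_family = AF_INET;
 *		l->sin_addr.s_addr = laddr;
 *		l->sin_port = lport;
 *		f->sin_addr.s_addr = faddr;
 *		f->sin_port = fport;
 *		ac.ac_start = TCPS_SYN_SENT;
 *		ac.ac_end = TCPS_TIME_WAIT;
 *		ac.ac_zoneid = ALL_ZONES;
 *
 *		sic.ic_cmd = TCP_IOC_ABORT_CONN;
 *		sic.ic_timout = -1;
 *		sic.ic_len = sizeof (ac);
 *		sic.ic_dp = (char *)&ac;
 *		(void) ioctl(fd, I_STR, &sic);
 *		(void) close(fd);
 *	}
 */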
63 static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
64 static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
65 static void	tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
66     ip_recv_attr_t *dummy);
67 static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
68 void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
69 static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
70     boolean_t, tcp_stack_t *);
71 
72 /*
73  * Macros used for accessing the different types of sockaddr
74  * structures inside a tcp_ioc_abort_conn_t.
75  */
76 #define	TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local)
77 #define	TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote)
78 #define	TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr)
79 #define	TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr)
80 #define	TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port)
81 #define	TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port)
82 #define	TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local)
83 #define	TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote)
84 #define	TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr)
85 #define	TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr)
86 #define	TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port)
87 #define	TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port)
88 
89 /*
90  * Return the correct error code to mimic the behavior
91  * of a connection reset.
92  */
93 #define	TCP_AC_GET_ERRCODE(state, err) {	\
94 		switch ((state)) {		\
95 		case TCPS_SYN_SENT:		\
96 		case TCPS_SYN_RCVD:		\
97 			(err) = ECONNREFUSED;	\
98 			break;			\
99 		case TCPS_ESTABLISHED:		\
100 		case TCPS_FIN_WAIT_1:		\
101 		case TCPS_FIN_WAIT_2:		\
102 		case TCPS_CLOSE_WAIT:		\
103 			(err) = ECONNRESET;	\
104 			break;			\
105 		case TCPS_CLOSING:		\
106 		case TCPS_LAST_ACK:		\
107 		case TCPS_TIME_WAIT:		\
108 			(err) = 0;		\
109 			break;			\
110 		default:			\
111 			(err) = ENXIO;		\
112 		}				\
113 	}
114 
115 /*
116  * Check if a tcp structure matches the info in acp.
117  */
118 #define	TCP_AC_ADDR_MATCH(acp, connp, tcp)			\
119 	(((acp)->ac_local.ss_family == AF_INET) ?		\
120 	((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||		\
121 	TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) &&	\
122 	(TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||		\
123 	TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) &&	\
124 	(TCP_AC_V4LPORT((acp)) == 0 ||				\
125 	TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) &&	\
126 	(TCP_AC_V4RPORT((acp)) == 0 ||				\
127 	TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) &&	\
128 	(acp)->ac_start <= (tcp)->tcp_state &&			\
129 	(acp)->ac_end >= (tcp)->tcp_state) :			\
130 	((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||	\
131 	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),		\
132 	&(connp)->conn_laddr_v6)) &&				\
133 	(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||	\
134 	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),		\
135 	&(connp)->conn_faddr_v6)) &&				\
136 	(TCP_AC_V6LPORT((acp)) == 0 ||				\
137 	TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) &&	\
138 	(TCP_AC_V6RPORT((acp)) == 0 ||				\
139 	TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) &&	\
140 	(acp)->ac_start <= (tcp)->tcp_state &&			\
141 	(acp)->ac_end >= (tcp)->tcp_state))
142 
143 #define	TCP_AC_MATCH(acp, connp, tcp)				\
144 	(((acp)->ac_zoneid == ALL_ZONES ||			\
145 	(acp)->ac_zoneid == (connp)->conn_zoneid) ?		\
146 	TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0)
147 
148 /*
149  * Build a message containing a tcp_ioc_abort_conn_t structure
150  * which is filled in with information from acp and tp.
151  */
152 static mblk_t *
153 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
154 {
155 	mblk_t *mp;
156 	tcp_ioc_abort_conn_t *tacp;
157 
158 	mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO);
159 	if (mp == NULL)
160 		return (NULL);
161 
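	/* Layout: the command word, then a fully specified abort structure. */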
162 	*((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN;
163 	tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr +
164 	    sizeof (uint32_t));
165 
166 	tacp->ac_start = acp->ac_start;
167 	tacp->ac_end = acp->ac_end;
168 	tacp->ac_zoneid = acp->ac_zoneid;
169 
170 	if (acp->ac_local.ss_family == AF_INET) {
171 		tacp->ac_local.ss_family = AF_INET;
172 		tacp->ac_remote.ss_family = AF_INET;
173 		TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4;
174 		TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4;
175 		TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport;
176 		TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport;
177 	} else {
178 		tacp->ac_local.ss_family = AF_INET6;
179 		tacp->ac_remote.ss_family = AF_INET6;
180 		TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6;
181 		TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6;
182 		TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport;
183 		TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport;
184 	}
185 	mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp);
186 	return (mp);
187 }
188 
189 /*
190  * Print a tcp_ioc_abort_conn_t structure.
191  */
192 static void
193 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
194 {
195 	char lbuf[128];
196 	char rbuf[128];
197 	sa_family_t af;
198 	in_port_t lport, rport;
199 	ushort_t logflags;
200 
201 	af = acp->ac_local.ss_family;
202 
203 	if (af == AF_INET) {
204 		(void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp),
205 		    lbuf, 128);
206 		(void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp),
207 		    rbuf, 128);
208 		lport = ntohs(TCP_AC_V4LPORT(acp));
209 		rport = ntohs(TCP_AC_V4RPORT(acp));
210 	} else {
211 		(void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp),
212 		    lbuf, 128);
213 		(void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp),
214 		    rbuf, 128);
215 		lport = ntohs(TCP_AC_V6LPORT(acp));
216 		rport = ntohs(TCP_AC_V6RPORT(acp));
217 	}
218 
219 	logflags = SL_TRACE | SL_NOTE;
220 	/*
221 	 * Don't print this message to the console if the operation was done
222 	 * to a non-global zone.
223 	 */
224 	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
225 		logflags |= SL_CONSOLE;
226 	(void) strlog(TCP_MOD_ID, 0, 1, logflags,
227 	    "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
228 	    "start = %d, end = %d\n", lbuf, lport, rbuf, rport,
229 	    acp->ac_start, acp->ac_end);
230 }
231 
232 /*
233  * Called using SQ_FILL when a message built using
234  * tcp_ioctl_abort_build_msg is put into a queue.
235  * Note that when we get here there is no wildcard in acp any more.
236  */
237 /* ARGSUSED2 */
238 static void
239 tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
240     ip_recv_attr_t *dummy)
241 {
242 	conn_t			*connp = (conn_t *)arg;
243 	tcp_t			*tcp = connp->conn_tcp;
244 	tcp_ioc_abort_conn_t	*acp;
245 
246 	/*
247 	 * Don't accept any input on a closed tcp as this TCP logically does
248 	 * not exist on the system. Don't proceed further with this TCP.
249 	 * For example, this packet could trigger another close of this tcp
250 	 * which would be disastrous for tcp_refcnt. tcp_close_detached /
251 	 * tcp_clean_death / tcp_closei_local must be called at most once
252 	 * on a TCP.
253 	 */
254 	if (tcp->tcp_state == TCPS_CLOSED ||
255 	    tcp->tcp_state == TCPS_BOUND) {
256 		freemsg(mp);
257 		return;
258 	}
259 
260 	acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t));
261 	if (tcp->tcp_state <= acp->ac_end) {
262 		/*
263 		 * If we get here, we are already on the correct
264 		 * squeue.  The ioctl reached us along the path
265 		 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn ->
266 		 * tcp_ioctl_abort -> squeue_enter (the last step only if
267 		 * the caller was on a different squeue).
268 		 */
269 		int errcode;
270 
271 		TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode);
272 		(void) tcp_clean_death(tcp, errcode);
273 	}
274 	freemsg(mp);
275 }
276 
277 /*
278  * Abort all matching connections on a hash chain.
279  */
280 static int
281 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count,
282     boolean_t exact, tcp_stack_t *tcps)
283 {
284 	int nmatch, err = 0;
285 	tcp_t *tcp;
286 	MBLKP mp, last, listhead = NULL;
287 	conn_t	*tconnp;
288 	connf_t	*connfp;
289 	ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
290 
291 	connfp = &ipst->ips_ipcl_conn_fanout[index];
292 
293 startover:
294 	nmatch = 0;
295 
296 	mutex_enter(&connfp->connf_lock);
297 	for (tconnp = connfp->connf_head; tconnp != NULL;
298 	    tconnp = tconnp->conn_next) {
299 		tcp = tconnp->conn_tcp;
300 		/*
301 		 * We are missing a check on sin6_scope_id for linklocals here,
302 		 * but current usage is just for aborting based on zoneid
303 		 * for shared-IP zones.
304 		 */
305 		if (TCP_AC_MATCH(acp, tconnp, tcp)) {
306 			CONN_INC_REF(tconnp);
307 			mp = tcp_ioctl_abort_build_msg(acp, tcp);
308 			if (mp == NULL) {
309 				err = ENOMEM;
310 				CONN_DEC_REF(tconnp);
311 				break;
312 			}
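			/*
			 * Borrow b_prev to remember which tcp this message is
			 * for; the dispatch loop below uses it to find the
			 * connection's squeue and then clears it.
			 */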
313 			mp->b_prev = (mblk_t *)tcp;
314 
315 			if (listhead == NULL) {
316 				listhead = mp;
317 				last = mp;
318 			} else {
319 				last->b_next = mp;
320 				last = mp;
321 			}
322 			nmatch++;
323 			if (exact)
324 				break;
325 		}
326 
327 		/* Avoid holding lock for too long. */
328 		if (nmatch >= 500)
329 			break;
330 	}
331 	mutex_exit(&connfp->connf_lock);
332 
333 	/* Pass mp into the correct tcp */
334 	while ((mp = listhead) != NULL) {
335 		listhead = listhead->b_next;
336 		tcp = (tcp_t *)mp->b_prev;
337 		mp->b_next = mp->b_prev = NULL;
338 		SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp,
339 		    tcp_ioctl_abort_handler, tcp->tcp_connp, NULL,
340 		    SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
341 	}
342 
343 	*count += nmatch;
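	/* If we bailed out of the scan early to drop the lock, rescan. */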
344 	if (nmatch >= 500 && err == 0)
345 		goto startover;
346 	return (err);
347 }
348 
349 /*
350  * Abort all connections that match the attributes specified in acp.
351  */
352 static int
353 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps)
354 {
355 	sa_family_t af;
356 	uint32_t  ports;
357 	uint16_t *pports;
358 	int err = 0, count = 0;
359 	boolean_t exact = B_FALSE; /* set when there is no wildcard */
360 	int index = -1;
361 	ushort_t logflags;
362 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
363 
364 	af = acp->ac_local.ss_family;
365 
366 	if (af == AF_INET) {
367 		if (TCP_AC_V4REMOTE(acp) != INADDR_ANY &&
368 		    TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) {
369 			pports = (uint16_t *)&ports;
370 			pports[1] = TCP_AC_V4LPORT(acp);
371 			pports[0] = TCP_AC_V4RPORT(acp);
			index = IPCL_CONN_HASH(TCP_AC_V4REMOTE(acp),
			    ports, ipst);
372 			exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY);
373 		}
374 	} else {
375 		if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) &&
376 		    TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) {
377 			pports = (uint16_t *)&ports;
378 			pports[1] = TCP_AC_V6LPORT(acp);
379 			pports[0] = TCP_AC_V6RPORT(acp);
			index = IPCL_CONN_HASH_V6(TCP_AC_V6REMOTE(acp),
			    ports, ipst);
380 			exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp));
381 		}
382 	}
383 
384 	/*
385 	 * For cases where remote addr, local port, and remote port are non-
386 	 * wildcards, tcp_ioctl_abort_bucket will only be called once.
387 	 */
388 	if (index != -1) {
389 		err = tcp_ioctl_abort_bucket(acp, index,
390 		    &count, exact, tcps);
391 	} else {
392 		/*
393 		 * Loop through all conn fanout buckets for the wildcard case.
394 		 */
395 		for (index = 0;
396 		    index < ipst->ips_ipcl_conn_fanout_size;
397 		    index++) {
398 			err = tcp_ioctl_abort_bucket(acp, index,
399 			    &count, exact, tcps);
400 			if (err != 0)
401 				break;
402 		}
403 	}
404 
405 	logflags = SL_TRACE | SL_NOTE;
406 	/*
407 	 * Don't print this message to the console if the operation was done
408 	 * to a non-global zone.
409 	 */
410 	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
411 		logflags |= SL_CONSOLE;
412 	(void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
413 	    "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
414 	if (err == 0 && count == 0)
415 		err = ENOENT;
416 	return (err);
417 }
418 
419 /*
420  * Process the TCP_IOC_ABORT_CONN ioctl request.
421  */
422 void
423 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp)
424 {
425 	int	err;
426 	IOCP    iocp;
427 	MBLKP   mp1;
428 	sa_family_t laf, raf;
429 	tcp_ioc_abort_conn_t *acp;
430 	zone_t		*zptr;
431 	conn_t		*connp = Q_TO_CONN(q);
432 	zoneid_t	zoneid = connp->conn_zoneid;
433 	tcp_t		*tcp = connp->conn_tcp;
434 	tcp_stack_t	*tcps = tcp->tcp_tcps;
435 
436 	iocp = (IOCP)mp->b_rptr;
437 
438 	if ((mp1 = mp->b_cont) == NULL ||
439 	    iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) {
440 		err = EINVAL;
441 		goto out;
442 	}
443 
444 	/* check permissions */
445 	if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) {
446 		err = EPERM;
447 		goto out;
448 	}
449 
450 	if (mp1->b_cont != NULL) {
451 		freemsg(mp1->b_cont);
452 		mp1->b_cont = NULL;
453 	}
454 
455 	acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr;
456 	laf = acp->ac_local.ss_family;
457 	raf = acp->ac_remote.ss_family;
458 
459 	/* check that a zone with the supplied zoneid exists */
460 	if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) {
461 		zptr = zone_find_by_id(zoneid);
462 		if (zptr != NULL) {
463 			zone_rele(zptr);
464 		} else {
465 			err = EINVAL;
466 			goto out;
467 		}
468 	}
469 
470 	/*
471 	 * For exclusive stacks we set the zoneid to zero
472 	 * to make TCP operate as if in the global zone.
473 	 */
474 	if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID)
475 		acp->ac_zoneid = GLOBAL_ZONEID;
476 
477 	if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT ||
478 	    acp->ac_start > acp->ac_end || laf != raf ||
479 	    (laf != AF_INET && laf != AF_INET6)) {
480 		err = EINVAL;
481 		goto out;
482 	}
483 
484 	tcp_ioctl_abort_dump(acp);
485 	err = tcp_ioctl_abort(acp, tcps);
486 
487 out:
488 	if (mp1 != NULL) {
489 		freemsg(mp1);
490 		mp->b_cont = NULL;
491 	}
492 
493 	if (err != 0)
494 		miocnak(q, mp, 0, err);
495 	else
496 		miocack(q, mp, 0, 0);
497 }
498 
499 /*
500  * Timeout function to reset the TCP stack variable tcps_reclaim to false.
501  */
502 void
503 tcp_reclaim_timer(void *arg)
504 {
505 	tcp_stack_t *tcps = (tcp_stack_t *)arg;
506 	int64_t tot_conn = 0;
507 	int i;
508 	extern pgcnt_t lotsfree, needfree;
509 
510 	for (i = 0; i < tcps->tcps_sc_cnt; i++)
511 		tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
512 
513 	/*
514 	 * Finding tcps_reclaim already cleared here means the stack is going
515 	 * away; in that case, return without resetting tcps_reclaim_tid to 0.
516 	 */
517 	mutex_enter(&tcps->tcps_reclaim_lock);
518 	if (!tcps->tcps_reclaim) {
519 		mutex_exit(&tcps->tcps_reclaim_lock);
520 		return;
521 	}
522 
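	/*
	 * Leave defensive mode once memory pressure has eased or this
	 * stack's connection count is no longer high enough to be the cause.
	 */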
523 	if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) {
524 		tcps->tcps_reclaim = B_FALSE;
525 		tcps->tcps_reclaim_tid = 0;
526 	} else {
527 		/* Stay in defensive mode and restart the timer */
528 		tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
529 		    tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
530 	}
531 	mutex_exit(&tcps->tcps_reclaim_lock);
532 }
533 
534 /*
535  * Kmem reclaim callback function.  When the system is under memory
536  * pressure, we set the TCP stack variable tcps_reclaim to true.  This
537  * variable is reset to false after tcps_reclaim_period msecs.  During this
538  * period, TCP will be more aggressive in aborting connections that are not
539  * making progress, i.e. that have been retransmitting for tcp_early_abort
540  * seconds.  TCP will also not accept new connection requests for listeners
541  * whose accept queue (q) or incomplete connection queue (q0) is not empty.
542  */
543 /* ARGSUSED */
544 void
545 tcp_conn_reclaim(void *arg)
546 {
547 	netstack_handle_t nh;
548 	netstack_t *ns;
549 	tcp_stack_t *tcps;
550 	extern pgcnt_t lotsfree, needfree;
551 
552 	if (!tcp_do_reclaim)
553 		return;
554 
555 	/*
556 	 * The reclaim function may be called even when the system is not
557 	 * really under memory pressure.
558 	 */
559 	if (freemem >= lotsfree + needfree)
560 		return;
561 
562 	netstack_next_init(&nh);
563 	while ((ns = netstack_next(&nh)) != NULL) {
564 		int i;
565 		int64_t tot_conn = 0;
566 
567 		/*
568 		 * During boot time, the first netstack_t is created and
569 		 * initialized before TCP has registered with the netstack
570 		 * framework.  If this reclaim function is called before TCP
571 		 * has finished its initialization, netstack_next() will
572 		 * return the first netstack_t (since its netstack_flags is
573 		 * not NSF_UNINIT).  And its netstack_tcp will be NULL.  We
574 		 * need to catch it.
575 		 *
576 		 * All subsequent netstack_t creation will not have this
577 		 * problem since the initialization is not finished until TCP
578 		 * has finished its own tcp_stack_t initialization.  Hence
579 		 * netstack_next() will not return one with NULL netstack_tcp.
580 		 */
581 		if ((tcps = ns->netstack_tcp) == NULL) {
582 			netstack_rele(ns);
583 			continue;
584 		}
585 
586 		/*
587 		 * Even if the system is under memory pressure, the reason may
588 		 * not be because of TCP activity.  Check the number of
589 		 * connections in each stack.  If the number exceeds the
590 		 * threshold (maxusers), turn on defensive mode.
591 		 */
592 		for (i = 0; i < tcps->tcps_sc_cnt; i++)
593 			tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
594 		if (tot_conn < maxusers) {
595 			netstack_rele(ns);
596 			continue;
597 		}
598 
599 		mutex_enter(&tcps->tcps_reclaim_lock);
600 		if (!tcps->tcps_reclaim) {
601 			tcps->tcps_reclaim = B_TRUE;
602 			tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
603 			    tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
604 			TCP_STAT(tcps, tcp_reclaim_cnt);
605 		}
606 		mutex_exit(&tcps->tcps_reclaim_lock);
607 		netstack_rele(ns);
608 	}
609 	netstack_next_fini(&nh);
610 }
611 
612 /*
613  * Given a tcp_stack_t and a port (in host byte order), find a listener
614  * configuration for that port and return the ratio.
615  */
616 uint32_t
617 tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port)
618 {
619 	tcp_listener_t	*tl;
620 	uint32_t ratio = 0;
621 
622 	mutex_enter(&tcps->tcps_listener_conf_lock);
623 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
624 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
625 		if (tl->tl_port == port) {
626 			ratio = tl->tl_ratio;
627 			break;
628 		}
629 	}
630 	mutex_exit(&tcps->tcps_listener_conf_lock);
631 	return (ratio);
632 }
633 
634 /*
635  * Ndd param helper routine to return the current list of listener limit
636  * configurations.
637  */
638 /* ARGSUSED */
639 int
640 tcp_listener_conf_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
641 {
642 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
643 	tcp_listener_t	*tl;
644 
645 	mutex_enter(&tcps->tcps_listener_conf_lock);
646 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
647 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
648 		(void) mi_mpprintf(mp, "%d:%d ", tl->tl_port, tl->tl_ratio);
649 	}
650 	mutex_exit(&tcps->tcps_listener_conf_lock);
651 	return (0);
652 }
653 
654 /*
655  * Ndd param helper routine to add a new listener limit configuration.
656  */
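 * The value is expected in the form "port:ratio", e.g. "80:2".  The ratio is
 * only stored here; callers of tcp_find_listener_conf() use it when sizing
 * that listener's connection limits.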
657 /* ARGSUSED */
658 int
659 tcp_listener_conf_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
660     cred_t *cr)
661 {
662 	tcp_listener_t	*new_tl;
663 	tcp_listener_t	*tl;
664 	long		lport;
665 	long		ratio;
666 	char		*colon;
667 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
668 
669 	if (ddi_strtol(value, &colon, 10, &lport) != 0 || lport <= 0 ||
670 	    lport > USHRT_MAX || *colon != ':') {
671 		return (EINVAL);
672 	}
673 	if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0)
674 		return (EINVAL);
675 
676 	mutex_enter(&tcps->tcps_listener_conf_lock);
677 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
678 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
679 		/* There is an existing entry, so update its ratio value. */
680 		if (tl->tl_port == lport) {
681 			tl->tl_ratio = ratio;
682 			mutex_exit(&tcps->tcps_listener_conf_lock);
683 			return (0);
684 		}
685 	}
686 
687 	if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) ==
688 	    NULL) {
689 		mutex_exit(&tcps->tcps_listener_conf_lock);
690 		return (ENOMEM);
691 	}
692 
693 	new_tl->tl_port = lport;
694 	new_tl->tl_ratio = ratio;
695 	list_insert_tail(&tcps->tcps_listener_conf, new_tl);
696 	mutex_exit(&tcps->tcps_listener_conf_lock);
697 	return (0);
698 }
699 
700 /*
701  * Ndd param helper routine to remove a listener limit configuration.
702  */
703 /* ARGSUSED */
704 int
705 tcp_listener_conf_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
706     cred_t *cr)
707 {
708 	tcp_listener_t	*tl;
709 	long		lport;
710 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
711 
712 	if (ddi_strtol(value, NULL, 10, &lport) != 0 || lport <= 0 ||
713 	    lport > USHRT_MAX) {
714 		return (EINVAL);
715 	}
716 	mutex_enter(&tcps->tcps_listener_conf_lock);
717 	for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
718 	    tl = list_next(&tcps->tcps_listener_conf, tl)) {
719 		if (tl->tl_port == lport) {
720 			list_remove(&tcps->tcps_listener_conf, tl);
721 			mutex_exit(&tcps->tcps_listener_conf_lock);
722 			kmem_free(tl, sizeof (tcp_listener_t));
723 			return (0);
724 		}
725 	}
726 	mutex_exit(&tcps->tcps_listener_conf_lock);
727 	return (ESRCH);
728 }
729 
730 /*
731  * Remove all listener limit configurations from a tcp_stack_t.
732  */
733 void
734 tcp_listener_conf_cleanup(tcp_stack_t *tcps)
735 {
736 	tcp_listener_t	*tl;
737 
738 	mutex_enter(&tcps->tcps_listener_conf_lock);
739 	while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) {
740 		list_remove(&tcps->tcps_listener_conf, tl);
741 		kmem_free(tl, sizeof (tcp_listener_t));
742 	}
743 	mutex_destroy(&tcps->tcps_listener_conf_lock);
744 	list_destroy(&tcps->tcps_listener_conf);
745 }
746 
747 /*
748  * Call back function for CPU state change.
749  */
750 /* ARGSUSED */
751 int
752 tcp_cpu_update(cpu_setup_t what, int id, void *arg)
753 {
754 	cpu_t *cp;
755 	netstack_handle_t nh;
756 	netstack_t *ns;
757 	tcp_stack_t *tcps;
758 	int i;
759 
760 	ASSERT(MUTEX_HELD(&cpu_lock));
761 	cp = cpu[id];
762 
763 	switch (what) {
764 	case CPU_CONFIG:
765 	case CPU_ON:
766 	case CPU_INIT:
767 	case CPU_CPUPART_IN:
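		/*
		 * A CPU is coming into service: make sure each TCP stack has
		 * a per-CPU stats slot covering this CPU's sequence ID.
		 */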
768 		netstack_next_init(&nh);
769 		while ((ns = netstack_next(&nh)) != NULL) {
770 			tcps = ns->netstack_tcp;
771 			if (cp->cpu_seqid >= tcps->tcps_sc_cnt) {
772 				for (i = tcps->tcps_sc_cnt; i <= cp->cpu_seqid;
773 				    i++) {
774 					ASSERT(tcps->tcps_sc[i] == NULL);
775 					tcps->tcps_sc[i] = kmem_zalloc(
776 					    sizeof (tcp_stats_cpu_t), KM_SLEEP);
777 				}
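				/*
				 * Publish the new tcp_stats_cpu_t pointers
				 * before raising tcps_sc_cnt; readers walk
				 * tcps_sc[] up to tcps_sc_cnt locklessly.
				 */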
778 				membar_producer();
779 				tcps->tcps_sc_cnt = cp->cpu_seqid + 1;
780 			}
781 			netstack_rele(ns);
782 		}
783 		netstack_next_fini(&nh);
784 		break;
785 	case CPU_UNCONFIG:
786 	case CPU_OFF:
787 	case CPU_CPUPART_OUT:
788 		/* Nothing to do */
789 		break;
790 	default:
791 		break;
792 	}
793 	return (0);
794 }
795 
796 /*
797  * Diagnostic routine used to return a string associated with the tcp state.
798  * Note that if the caller does not supply a buffer, it will use an internal
799  * static string.  This means that if multiple threads call this function at
800  * the same time, output can be corrupted...  Note also that this function
801  * does not check the size of the supplied buffer.  The caller has to make
802  * sure that it is big enough.
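 *
 * For illustration, DISP_ADDR_AND_PORT output looks roughly like this
 * (addresses are always rendered in IPv6 form, IPv4 ones as v4-mapped):
 *	[::ffff:192.168.1.5.80, ::ffff:192.168.1.9.51234] TCP_ESTABLISHED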
803  */
804 char *
805 tcp_display(tcp_t *tcp, char *sup_buf, char format)
806 {
807 	char		buf1[30];
808 	static char	priv_buf[INET6_ADDRSTRLEN * 2 + 80];
809 	char		*buf;
810 	char		*cp;
811 	in6_addr_t	local, remote;
812 	char		local_addrbuf[INET6_ADDRSTRLEN];
813 	char		remote_addrbuf[INET6_ADDRSTRLEN];
814 	conn_t		*connp;
815 
816 	if (sup_buf != NULL)
817 		buf = sup_buf;
818 	else
819 		buf = priv_buf;
820 
821 	if (tcp == NULL)
822 		return ("NULL_TCP");
823 
824 	connp = tcp->tcp_connp;
825 	switch (tcp->tcp_state) {
826 	case TCPS_CLOSED:
827 		cp = "TCP_CLOSED";
828 		break;
829 	case TCPS_IDLE:
830 		cp = "TCP_IDLE";
831 		break;
832 	case TCPS_BOUND:
833 		cp = "TCP_BOUND";
834 		break;
835 	case TCPS_LISTEN:
836 		cp = "TCP_LISTEN";
837 		break;
838 	case TCPS_SYN_SENT:
839 		cp = "TCP_SYN_SENT";
840 		break;
841 	case TCPS_SYN_RCVD:
842 		cp = "TCP_SYN_RCVD";
843 		break;
844 	case TCPS_ESTABLISHED:
845 		cp = "TCP_ESTABLISHED";
846 		break;
847 	case TCPS_CLOSE_WAIT:
848 		cp = "TCP_CLOSE_WAIT";
849 		break;
850 	case TCPS_FIN_WAIT_1:
851 		cp = "TCP_FIN_WAIT_1";
852 		break;
853 	case TCPS_CLOSING:
854 		cp = "TCP_CLOSING";
855 		break;
856 	case TCPS_LAST_ACK:
857 		cp = "TCP_LAST_ACK";
858 		break;
859 	case TCPS_FIN_WAIT_2:
860 		cp = "TCP_FIN_WAIT_2";
861 		break;
862 	case TCPS_TIME_WAIT:
863 		cp = "TCP_TIME_WAIT";
864 		break;
865 	default:
866 		(void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
867 		cp = buf1;
868 		break;
869 	}
870 	switch (format) {
871 	case DISP_ADDR_AND_PORT:
872 		if (connp->conn_ipversion == IPV4_VERSION) {
873 			/*
874 			 * Note that we use the remote address stored in the
875 			 * conn_t (conn_faddr).  This means that it will print
876 			 * out the real destination address, not the next
877 			 * hop's address, if source routing is used.
878 			 */
879 			IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
880 			IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
881 
882 		} else {
883 			local = connp->conn_laddr_v6;
884 			remote = connp->conn_faddr_v6;
885 		}
886 		(void) inet_ntop(AF_INET6, &local, local_addrbuf,
887 		    sizeof (local_addrbuf));
888 		(void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
889 		    sizeof (remote_addrbuf));
890 		(void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
891 		    local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
892 		    ntohs(connp->conn_fport), cp);
893 		break;
894 	case DISP_PORT_ONLY:
895 	default:
896 		(void) mi_sprintf(buf, "[%u, %u] %s",
897 		    ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
898 		break;
899 	}
900 
901 	return (buf);
902 }
903