1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/strlog.h>
28 #include <sys/policy.h>
29 #include <sys/strsun.h>
30 #include <sys/squeue_impl.h>
31 #include <sys/squeue.h>
32
33 #include <inet/common.h>
34 #include <inet/ip.h>
35 #include <inet/tcp.h>
36 #include <inet/tcp_impl.h>
37
38 /* Control whether TCP can enter defensive mode when under memory pressure. */
39 static boolean_t tcp_do_reclaim = B_TRUE;
40
41 /*
42 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
43 *
44 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
45 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
46 * (defined in tcp.h) needs to be filled in and passed into the kernel
47 * via an I_STR ioctl command (see streamio(4I)). The tcp_ioc_abort_conn_t
48 * structure contains the four-tuple of a TCP connection and a range of TCP
49 * states (specified by ac_start and ac_end). The use of wildcard addresses
50 * and ports is allowed. Connections with a matching four tuple and a state
51 * within the specified range will be aborted. The valid states for the
52 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
53 * inclusive.
54 *
55 * An application which has its connection aborted by this ioctl will receive
56 * an error that is dependent on the connection state at the time of the abort.
57 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
58 * though a RST packet has been received. If the connection state is equal to
59 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
60 * and all resources associated with the connection will be freed.
61 */
62 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
63 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
64 static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
65 ip_recv_attr_t *dummy);
66 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
67 void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
68 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
69 boolean_t, tcp_stack_t *);
70
/*
 * Accessors for the sockaddr structures embedded in a tcp_ioc_abort_conn_t.
 * ac_local and ac_remote are viewed either as sin_t (IPv4) or as sin6_t
 * (IPv6); the caller picks the view that matches the address family.
 */

/* IPv4 view: whole sockaddr, then address and port of each endpoint. */
#define	TCP_AC_V4LADDR(acp)	((sin_t *)&(acp)->ac_local)
#define	TCP_AC_V4RADDR(acp)	((sin_t *)&(acp)->ac_remote)
#define	TCP_AC_V4LOCAL(acp)	(TCP_AC_V4LADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4LPORT(acp)	(TCP_AC_V4LADDR(acp)->sin_port)
#define	TCP_AC_V4REMOTE(acp)	(TCP_AC_V4RADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4RPORT(acp)	(TCP_AC_V4RADDR(acp)->sin_port)

/* IPv6 view: whole sockaddr, then address and port of each endpoint. */
#define	TCP_AC_V6LADDR(acp)	((sin6_t *)&(acp)->ac_local)
#define	TCP_AC_V6RADDR(acp)	((sin6_t *)&(acp)->ac_remote)
#define	TCP_AC_V6LOCAL(acp)	(TCP_AC_V6LADDR(acp)->sin6_addr)
#define	TCP_AC_V6LPORT(acp)	(TCP_AC_V6LADDR(acp)->sin6_port)
#define	TCP_AC_V6REMOTE(acp)	(TCP_AC_V6RADDR(acp)->sin6_addr)
#define	TCP_AC_V6RPORT(acp)	(TCP_AC_V6RADDR(acp)->sin6_port)
87
/*
 * Store in err the error code that mimics the behavior of a connection
 * reset for a connection that is in the given state:
 *
 *	SYN_SENT, SYN_RCVD				ECONNREFUSED
 *	ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT	ECONNRESET
 *	CLOSING, LAST_ACK, TIME_WAIT			0
 *	any other state					ENXIO
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else.
 */
#define	TCP_AC_GET_ERRCODE(state, err)	do {	\
		switch ((state)) {		\
		case TCPS_SYN_SENT:		\
		case TCPS_SYN_RCVD:		\
			(err) = ECONNREFUSED;	\
			break;			\
		case TCPS_ESTABLISHED:		\
		case TCPS_FIN_WAIT_1:		\
		case TCPS_FIN_WAIT_2:		\
		case TCPS_CLOSE_WAIT:		\
			(err) = ECONNRESET;	\
			break;			\
		case TCPS_CLOSING:		\
		case TCPS_LAST_ACK:		\
		case TCPS_TIME_WAIT:		\
			(err) = 0;		\
			break;			\
		default:			\
			(err) = ENXIO;		\
			break;			\
		}				\
	} while (0)
113
/*
 * Matching of a connection (connp/tcp) against the request in acp.
 * An unspecified address or a zero port in acp acts as a wildcard.
 */

/* The connection state lies within [ac_start, ac_end]. */
#define	TCP_AC_STATE_MATCH(acp, tcp)					\
	((acp)->ac_start <= (tcp)->tcp_state &&				\
	(acp)->ac_end >= (tcp)->tcp_state)

/* IPv4 addresses and ports match, honoring wildcards. */
#define	TCP_AC_V4_MATCH(acp, connp)					\
	((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||			\
	TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) &&		\
	(TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||			\
	TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) &&		\
	(TCP_AC_V4LPORT((acp)) == 0 ||					\
	TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) &&		\
	(TCP_AC_V4RPORT((acp)) == 0 ||					\
	TCP_AC_V4RPORT((acp)) == (connp)->conn_fport))

/* IPv6 addresses and ports match, honoring wildcards. */
#define	TCP_AC_V6_MATCH(acp, connp)					\
	((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||		\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),			\
	&(connp)->conn_laddr_v6)) &&					\
	(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||		\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),			\
	&(connp)->conn_faddr_v6)) &&					\
	(TCP_AC_V6LPORT((acp)) == 0 ||					\
	TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) &&		\
	(TCP_AC_V6RPORT((acp)) == 0 ||					\
	TCP_AC_V6RPORT((acp)) == (connp)->conn_fport))

/*
 * Check if a tcp structure matches the addresses, ports and state range
 * in acp.  Any family other than AF_INET is treated as IPv6.
 */
#define	TCP_AC_ADDR_MATCH(acp, connp, tcp)				\
	((((acp)->ac_local.ss_family == AF_INET) ?			\
	TCP_AC_V4_MATCH(acp, connp) : TCP_AC_V6_MATCH(acp, connp)) &&	\
	TCP_AC_STATE_MATCH(acp, tcp))

/*
 * Full match: the zone must agree (ALL_ZONES matches every zone) before
 * the addresses, ports and state are compared.
 */
#define	TCP_AC_MATCH(acp, connp, tcp)					\
	(((acp)->ac_zoneid == ALL_ZONES ||				\
	(acp)->ac_zoneid == (connp)->conn_zoneid) &&			\
	TCP_AC_ADDR_MATCH(acp, connp, tcp))
146
147 /*
148 * Build a message containing a tcp_ioc_abort_conn_t structure
149 * which is filled in with information from acp and tp.
150 */
151 static mblk_t *
tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t * acp,tcp_t * tp)152 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
153 {
154 mblk_t *mp;
155 tcp_ioc_abort_conn_t *tacp;
156
157 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO);
158 if (mp == NULL)
159 return (NULL);
160
161 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN;
162 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr +
163 sizeof (uint32_t));
164
165 tacp->ac_start = acp->ac_start;
166 tacp->ac_end = acp->ac_end;
167 tacp->ac_zoneid = acp->ac_zoneid;
168
169 if (acp->ac_local.ss_family == AF_INET) {
170 tacp->ac_local.ss_family = AF_INET;
171 tacp->ac_remote.ss_family = AF_INET;
172 TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4;
173 TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4;
174 TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport;
175 TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport;
176 } else {
177 tacp->ac_local.ss_family = AF_INET6;
178 tacp->ac_remote.ss_family = AF_INET6;
179 TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6;
180 TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6;
181 TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport;
182 TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport;
183 }
184 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp);
185 return (mp);
186 }
187
188 /*
189 * Print a tcp_ioc_abort_conn_t structure.
190 */
191 static void
tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t * acp)192 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
193 {
194 char lbuf[128];
195 char rbuf[128];
196 sa_family_t af;
197 in_port_t lport, rport;
198 ushort_t logflags;
199
200 af = acp->ac_local.ss_family;
201
202 if (af == AF_INET) {
203 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp),
204 lbuf, 128);
205 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp),
206 rbuf, 128);
207 lport = ntohs(TCP_AC_V4LPORT(acp));
208 rport = ntohs(TCP_AC_V4RPORT(acp));
209 } else {
210 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp),
211 lbuf, 128);
212 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp),
213 rbuf, 128);
214 lport = ntohs(TCP_AC_V6LPORT(acp));
215 rport = ntohs(TCP_AC_V6RPORT(acp));
216 }
217
218 logflags = SL_TRACE | SL_NOTE;
219 /*
220 * Don't print this message to the console if the operation was done
221 * to a non-global zone.
222 */
223 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
224 logflags |= SL_CONSOLE;
225 (void) strlog(TCP_MOD_ID, 0, 1, logflags,
226 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
227 "start = %d, end = %d\n", lbuf, lport, rbuf, rport,
228 acp->ac_start, acp->ac_end);
229 }
230
231 /*
232 * Called using SQ_FILL when a message built using
233 * tcp_ioctl_abort_build_msg is put into a queue.
234 * Note that when we get here there is no wildcard in acp any more.
235 */
236 /* ARGSUSED2 */
237 static void
tcp_ioctl_abort_handler(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)238 tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
239 ip_recv_attr_t *dummy)
240 {
241 conn_t *connp = (conn_t *)arg;
242 tcp_t *tcp = connp->conn_tcp;
243 tcp_ioc_abort_conn_t *acp;
244
245 /*
246 * Don't accept any input on a closed tcp as this TCP logically does
247 * not exist on the system. Don't proceed further with this TCP.
248 * For eg. this packet could trigger another close of this tcp
249 * which would be disastrous for tcp_refcnt. tcp_close_detached /
250 * tcp_clean_death / tcp_closei_local must be called at most once
251 * on a TCP.
252 */
253 if (tcp->tcp_state == TCPS_CLOSED ||
254 tcp->tcp_state == TCPS_BOUND) {
255 freemsg(mp);
256 return;
257 }
258
259 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t));
260 if (tcp->tcp_state <= acp->ac_end) {
261 /*
262 * If we get here, we are already on the correct
263 * squeue. This ioctl follows the following path
264 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn
265 * ->tcp_ioctl_abort->squeue_enter (if on a
266 * different squeue)
267 */
268 int errcode;
269
270 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode);
271 (void) tcp_clean_death(tcp, errcode);
272 }
273 freemsg(mp);
274 }
275
276 /*
277 * Abort all matching connections on a hash chain.
278 */
279 static int
tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t * acp,int index,int * count,boolean_t exact,tcp_stack_t * tcps)280 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count,
281 boolean_t exact, tcp_stack_t *tcps)
282 {
283 int nmatch, err = 0;
284 tcp_t *tcp;
285 MBLKP mp, last, listhead = NULL;
286 conn_t *tconnp;
287 connf_t *connfp;
288 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
289
290 connfp = &ipst->ips_ipcl_conn_fanout[index];
291
292 startover:
293 nmatch = 0;
294 last = NULL;
295
296 mutex_enter(&connfp->connf_lock);
297 for (tconnp = connfp->connf_head; tconnp != NULL;
298 tconnp = tconnp->conn_next) {
299 tcp = tconnp->conn_tcp;
300 /*
301 * We are missing a check on sin6_scope_id for linklocals here,
302 * but current usage is just for aborting based on zoneid
303 * for shared-IP zones.
304 */
305 if (TCP_AC_MATCH(acp, tconnp, tcp)) {
306 CONN_INC_REF(tconnp);
307 mp = tcp_ioctl_abort_build_msg(acp, tcp);
308 if (mp == NULL) {
309 err = ENOMEM;
310 CONN_DEC_REF(tconnp);
311 break;
312 }
313 mp->b_prev = (mblk_t *)tcp;
314
315 if (listhead == NULL) {
316 listhead = mp;
317 last = mp;
318 } else {
319 last->b_next = mp;
320 last = mp;
321 }
322 nmatch++;
323 if (exact)
324 break;
325 }
326
327 /* Avoid holding lock for too long. */
328 if (nmatch >= 500)
329 break;
330 }
331 mutex_exit(&connfp->connf_lock);
332
333 /* Pass mp into the correct tcp */
334 while ((mp = listhead) != NULL) {
335 listhead = listhead->b_next;
336 tcp = (tcp_t *)mp->b_prev;
337 mp->b_next = mp->b_prev = NULL;
338 SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp,
339 tcp_ioctl_abort_handler, tcp->tcp_connp, NULL,
340 SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
341 }
342
343 *count += nmatch;
344 if (nmatch >= 500 && err == 0)
345 goto startover;
346 return (err);
347 }
348
349 /*
350 * Abort all connections that matches the attributes specified in acp.
351 */
352 static int
tcp_ioctl_abort(tcp_ioc_abort_conn_t * acp,tcp_stack_t * tcps)353 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps)
354 {
355 sa_family_t af;
356 uint32_t ports;
357 uint16_t *pports;
358 int err = 0, count = 0;
359 boolean_t exact = B_FALSE; /* set when there is no wildcard */
360 int index = -1;
361 ushort_t logflags;
362 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
363
364 af = acp->ac_local.ss_family;
365
366 if (af == AF_INET) {
367 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY &&
368 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) {
369 pports = (uint16_t *)&ports;
370 pports[1] = TCP_AC_V4LPORT(acp);
371 pports[0] = TCP_AC_V4RPORT(acp);
372 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY);
373 }
374 } else {
375 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) &&
376 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) {
377 pports = (uint16_t *)&ports;
378 pports[1] = TCP_AC_V6LPORT(acp);
379 pports[0] = TCP_AC_V6RPORT(acp);
380 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp));
381 }
382 }
383
384 /*
385 * For cases where remote addr, local port, and remote port are non-
386 * wildcards, tcp_ioctl_abort_bucket will only be called once.
387 */
388 if (index != -1) {
389 err = tcp_ioctl_abort_bucket(acp, index,
390 &count, exact, tcps);
391 } else {
392 /*
393 * loop through all entries for wildcard case
394 */
395 for (index = 0;
396 index < ipst->ips_ipcl_conn_fanout_size;
397 index++) {
398 err = tcp_ioctl_abort_bucket(acp, index,
399 &count, exact, tcps);
400 if (err != 0)
401 break;
402 }
403 }
404
405 logflags = SL_TRACE | SL_NOTE;
406 /*
407 * Don't print this message to the console if the operation was done
408 * to a non-global zone.
409 */
410 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
411 logflags |= SL_CONSOLE;
412 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
413 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
414 if (err == 0 && count == 0)
415 err = ENOENT;
416 return (err);
417 }
418
419 /*
420 * Process the TCP_IOC_ABORT_CONN ioctl request.
421 */
422 void
tcp_ioctl_abort_conn(queue_t * q,mblk_t * mp)423 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp)
424 {
425 int err;
426 IOCP iocp;
427 MBLKP mp1;
428 sa_family_t laf, raf;
429 tcp_ioc_abort_conn_t *acp;
430 zone_t *zptr;
431 conn_t *connp = Q_TO_CONN(q);
432 zoneid_t zoneid = connp->conn_zoneid;
433 tcp_t *tcp = connp->conn_tcp;
434 tcp_stack_t *tcps = tcp->tcp_tcps;
435
436 iocp = (IOCP)mp->b_rptr;
437
438 if ((mp1 = mp->b_cont) == NULL ||
439 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) {
440 err = EINVAL;
441 goto out;
442 }
443
444 /* check permissions */
445 if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) {
446 err = EPERM;
447 goto out;
448 }
449
450 if (mp1->b_cont != NULL) {
451 freemsg(mp1->b_cont);
452 mp1->b_cont = NULL;
453 }
454
455 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr;
456 laf = acp->ac_local.ss_family;
457 raf = acp->ac_remote.ss_family;
458
459 /* check that a zone with the supplied zoneid exists */
460 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) {
461 zptr = zone_find_by_id(zoneid);
462 if (zptr != NULL) {
463 zone_rele(zptr);
464 } else {
465 err = EINVAL;
466 goto out;
467 }
468 }
469
470 /*
471 * For exclusive stacks we set the zoneid to zero
472 * to make TCP operate as if in the global zone.
473 */
474 if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID)
475 acp->ac_zoneid = GLOBAL_ZONEID;
476
477 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT ||
478 acp->ac_start > acp->ac_end || laf != raf ||
479 (laf != AF_INET && laf != AF_INET6)) {
480 err = EINVAL;
481 goto out;
482 }
483
484 tcp_ioctl_abort_dump(acp);
485 err = tcp_ioctl_abort(acp, tcps);
486
487 out:
488 if (mp1 != NULL) {
489 freemsg(mp1);
490 mp->b_cont = NULL;
491 }
492
493 if (err != 0)
494 miocnak(q, mp, 0, err);
495 else
496 miocack(q, mp, 0, 0);
497 }
498
499 /*
500 * Timeout function to reset the TCP stack variable tcps_reclaim to false.
501 */
502 void
tcp_reclaim_timer(void * arg)503 tcp_reclaim_timer(void *arg)
504 {
505 tcp_stack_t *tcps = (tcp_stack_t *)arg;
506 int64_t tot_conn = 0;
507 int i;
508 extern pgcnt_t lotsfree, needfree;
509
510 for (i = 0; i < tcps->tcps_sc_cnt; i++)
511 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
512
513 /*
514 * This happens only when a stack is going away. tcps_reclaim_tid
515 * should not be reset to 0 when returning in this case.
516 */
517 mutex_enter(&tcps->tcps_reclaim_lock);
518 if (!tcps->tcps_reclaim) {
519 mutex_exit(&tcps->tcps_reclaim_lock);
520 return;
521 }
522
523 if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) {
524 tcps->tcps_reclaim = B_FALSE;
525 tcps->tcps_reclaim_tid = 0;
526 } else {
527 /* Stay in defensive mode and restart the timer */
528 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
529 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
530 }
531 mutex_exit(&tcps->tcps_reclaim_lock);
532 }
533
534 /*
535 * Kmem reclaim call back function. When the system is under memory
536 * pressure, we set the TCP stack variable tcps_reclaim to true. This
537 * variable is reset to false after tcps_reclaim_period msecs. During this
538 * period, TCP will be more aggressive in aborting connections not making
539 * progress, meaning retransmitting for some time (tcp_early_abort seconds).
540 * TCP will also not accept new connection request for those listeners whose
541 * q or q0 is not empty.
542 */
543 /* ARGSUSED */
544 void
tcp_conn_reclaim(void * arg)545 tcp_conn_reclaim(void *arg)
546 {
547 netstack_handle_t nh;
548 netstack_t *ns;
549 tcp_stack_t *tcps;
550 extern pgcnt_t lotsfree, needfree;
551
552 if (!tcp_do_reclaim)
553 return;
554
555 /*
556 * The reclaim function may be called even when the system is not
557 * really under memory pressure.
558 */
559 if (freemem >= lotsfree + needfree)
560 return;
561
562 netstack_next_init(&nh);
563 while ((ns = netstack_next(&nh)) != NULL) {
564 int i;
565 int64_t tot_conn = 0;
566
567 /*
568 * During boot time, the first netstack_t is created and
569 * initialized before TCP has registered with the netstack
570 * framework. If this reclaim function is called before TCP
571 * has finished its initialization, netstack_next() will
572 * return the first netstack_t (since its netstack_flags is
573 * not NSF_UNINIT). And its netstack_tcp will be NULL. We
574 * need to catch it.
575 *
576 * All subsequent netstack_t creation will not have this
577 * problem since the initialization is not finished until TCP
578 * has finished its own tcp_stack_t initialization. Hence
579 * netstack_next() will not return one with NULL netstack_tcp.
580 */
581 if ((tcps = ns->netstack_tcp) == NULL) {
582 netstack_rele(ns);
583 continue;
584 }
585
586 /*
587 * Even if the system is under memory pressure, the reason may
588 * not be because of TCP activity. Check the number of
589 * connections in each stack. If the number exceeds the
590 * threshold (maxusers), turn on defensive mode.
591 */
592 for (i = 0; i < tcps->tcps_sc_cnt; i++)
593 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt;
594 if (tot_conn < maxusers) {
595 netstack_rele(ns);
596 continue;
597 }
598
599 mutex_enter(&tcps->tcps_reclaim_lock);
600 if (!tcps->tcps_reclaim) {
601 tcps->tcps_reclaim = B_TRUE;
602 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer,
603 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period));
604 TCP_STAT(tcps, tcp_reclaim_cnt);
605 }
606 mutex_exit(&tcps->tcps_reclaim_lock);
607 netstack_rele(ns);
608 }
609 netstack_next_fini(&nh);
610 }
611
612 /*
613 * Given a tcp_stack_t and a port (in host byte order), find a listener
614 * configuration for that port and return the ratio.
615 */
616 uint32_t
tcp_find_listener_conf(tcp_stack_t * tcps,in_port_t port)617 tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port)
618 {
619 tcp_listener_t *tl;
620 uint32_t ratio = 0;
621
622 mutex_enter(&tcps->tcps_listener_conf_lock);
623 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
624 tl = list_next(&tcps->tcps_listener_conf, tl)) {
625 if (tl->tl_port == port) {
626 ratio = tl->tl_ratio;
627 break;
628 }
629 }
630 mutex_exit(&tcps->tcps_listener_conf_lock);
631 return (ratio);
632 }
633
634 /*
635 * To remove all listener limit configuration in a tcp_stack_t.
636 */
637 void
tcp_listener_conf_cleanup(tcp_stack_t * tcps)638 tcp_listener_conf_cleanup(tcp_stack_t *tcps)
639 {
640 tcp_listener_t *tl;
641
642 mutex_enter(&tcps->tcps_listener_conf_lock);
643 while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) {
644 list_remove(&tcps->tcps_listener_conf, tl);
645 kmem_free(tl, sizeof (tcp_listener_t));
646 }
647 mutex_destroy(&tcps->tcps_listener_conf_lock);
648 list_destroy(&tcps->tcps_listener_conf);
649 }
650
651 /*
652 * When a CPU is added, we need to allocate the per CPU stats struct.
653 */
654 void
tcp_stack_cpu_add(tcp_stack_t * tcps,processorid_t cpu_seqid)655 tcp_stack_cpu_add(tcp_stack_t *tcps, processorid_t cpu_seqid)
656 {
657 int i;
658
659 if (cpu_seqid < tcps->tcps_sc_cnt)
660 return;
661 for (i = tcps->tcps_sc_cnt; i <= cpu_seqid; i++) {
662 ASSERT(tcps->tcps_sc[i] == NULL);
663 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t),
664 KM_SLEEP);
665 }
666 membar_producer();
667 tcps->tcps_sc_cnt = cpu_seqid + 1;
668 }
669
670 /*
671 * Diagnostic routine used to return a string associated with the tcp state.
672 * Note that if the caller does not supply a buffer, it will use an internal
673 * static string. This means that if multiple threads call this function at
674 * the same time, output can be corrupted... Note also that this function
675 * does not check the size of the supplied buffer. The caller has to make
676 * sure that it is big enough.
677 */
678 char *
tcp_display(tcp_t * tcp,char * sup_buf,char format)679 tcp_display(tcp_t *tcp, char *sup_buf, char format)
680 {
681 char buf1[30];
682 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80];
683 char *buf;
684 char *cp;
685 in6_addr_t local, remote;
686 char local_addrbuf[INET6_ADDRSTRLEN];
687 char remote_addrbuf[INET6_ADDRSTRLEN];
688 conn_t *connp;
689
690 if (sup_buf != NULL)
691 buf = sup_buf;
692 else
693 buf = priv_buf;
694
695 if (tcp == NULL)
696 return ("NULL_TCP");
697
698 connp = tcp->tcp_connp;
699 switch (tcp->tcp_state) {
700 case TCPS_CLOSED:
701 cp = "TCP_CLOSED";
702 break;
703 case TCPS_IDLE:
704 cp = "TCP_IDLE";
705 break;
706 case TCPS_BOUND:
707 cp = "TCP_BOUND";
708 break;
709 case TCPS_LISTEN:
710 cp = "TCP_LISTEN";
711 break;
712 case TCPS_SYN_SENT:
713 cp = "TCP_SYN_SENT";
714 break;
715 case TCPS_SYN_RCVD:
716 cp = "TCP_SYN_RCVD";
717 break;
718 case TCPS_ESTABLISHED:
719 cp = "TCP_ESTABLISHED";
720 break;
721 case TCPS_CLOSE_WAIT:
722 cp = "TCP_CLOSE_WAIT";
723 break;
724 case TCPS_FIN_WAIT_1:
725 cp = "TCP_FIN_WAIT_1";
726 break;
727 case TCPS_CLOSING:
728 cp = "TCP_CLOSING";
729 break;
730 case TCPS_LAST_ACK:
731 cp = "TCP_LAST_ACK";
732 break;
733 case TCPS_FIN_WAIT_2:
734 cp = "TCP_FIN_WAIT_2";
735 break;
736 case TCPS_TIME_WAIT:
737 cp = "TCP_TIME_WAIT";
738 break;
739 default:
740 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
741 cp = buf1;
742 break;
743 }
744 switch (format) {
745 case DISP_ADDR_AND_PORT:
746 if (connp->conn_ipversion == IPV4_VERSION) {
747 /*
748 * Note that we use the remote address in the tcp_b
749 * structure. This means that it will print out
750 * the real destination address, not the next hop's
751 * address if source routing is used.
752 */
753 IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
754 IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
755
756 } else {
757 local = connp->conn_laddr_v6;
758 remote = connp->conn_faddr_v6;
759 }
760 (void) inet_ntop(AF_INET6, &local, local_addrbuf,
761 sizeof (local_addrbuf));
762 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
763 sizeof (remote_addrbuf));
764 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
765 local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
766 ntohs(connp->conn_fport), cp);
767 break;
768 case DISP_PORT_ONLY:
769 default:
770 (void) mi_sprintf(buf, "[%u, %u] %s",
771 ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
772 break;
773 }
774
775 return (buf);
776 }
777